aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir2772
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir2496
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll891
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir40
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll240
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir121
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir24
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16-math.ll142
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll119
-rw-r--r--llvm/test/CodeGen/AMDGPU/build_vector.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll240
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll3195
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll1173
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll95
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir33
-rw-r--r--llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir178
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir90
-rw-r--r--llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir32
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir94
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll61
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll201
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll587
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll1856
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll4953
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll328
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll2278
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll183
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll1238
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll174
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll878
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll147
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll568
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll370
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll634
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll167
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll512
-rw-r--r--llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll1074
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll51
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll29
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll106
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.ll482
-rw-r--r--llvm/test/CodeGen/AMDGPU/ssubo.ll676
-rw-r--r--llvm/test/CodeGen/AMDGPU/structurize-hoist.ll180
-rw-r--r--llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll553
-rw-r--r--llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll499
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll2
75 files changed, 25505 insertions, 6752 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
index 789385d..b770d43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
@@ -1,12 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s
---
name: test_f32_add_mul
@@ -24,15 +20,7 @@ body: |
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -43,15 +31,7 @@ body: |
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_f32_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -62,15 +42,7 @@ body: |
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -81,15 +53,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -100,6 +63,60 @@ body: |
...
---
+name: test_f32_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = contract G_FMUL %0, %1
+ %5:_(s32) = contract G_FADD %4, %2
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_f32_add_mul_rhs
body: |
bb.1.entry:
@@ -115,15 +132,7 @@ body: |
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -134,15 +143,7 @@ body: |
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_f32_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -153,15 +154,7 @@ body: |
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -172,15 +165,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -191,6 +175,60 @@ body: |
...
---
+name: test_f32_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = contract G_FMUL %0, %1
+ %5:_(s32) = contract G_FADD %2, %4
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_add_mul_multiple_defs_z
body: |
bb.1.entry:
@@ -209,18 +247,7 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-DENORM-NEXT: {{ $}}
@@ -234,18 +261,7 @@ body: |
; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-LABEL: name: test_add_mul_multiple_defs_z
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -259,18 +275,7 @@ body: |
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-DENORM-NEXT: {{ $}}
@@ -284,18 +289,6 @@ body: |
; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%4:_(s32) = COPY $vgpr2
@@ -310,6 +303,76 @@ body: |
...
---
+name: test_add_mul_multiple_defs_z_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %4:_(s32) = COPY $vgpr2
+ %5:_(s32) = COPY $vgpr3
+ %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1)
+ %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
+ %8:_(s32) = COPY %13(s32)
+ %10:_(s32) = contract G_FADD %6, %8
+ $vgpr0 = COPY %10(s32)
+...
+
+---
name: test_add_mul_rhs_multiple_defs_z
body: |
bb.1.entry:
@@ -328,18 +391,7 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-DENORM-NEXT: {{ $}}
@@ -353,18 +405,7 @@ body: |
; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -378,18 +419,7 @@ body: |
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-DENORM-NEXT: {{ $}}
@@ -403,18 +433,6 @@ body: |
; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%4:_(s32) = COPY $vgpr2
@@ -429,6 +447,76 @@ body: |
...
---
+name: test_add_mul_rhs_multiple_defs_z_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %4:_(s32) = COPY $vgpr2
+ %5:_(s32) = COPY $vgpr3
+ %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1)
+ %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
+ %8:_(s32) = COPY %13(s32)
+ %10:_(s32) = contract G_FADD %8, %6
+ $vgpr0 = COPY %10(s32)
+...
+
+---
name: test_half_add_mul
body: |
bb.1.entry:
@@ -448,19 +536,7 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_half_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -475,19 +551,7 @@ body: |
; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_half_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -502,19 +566,7 @@ body: |
; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_half_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -529,19 +581,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -556,6 +595,80 @@ body: |
...
---
+name: test_half_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = contract G_FMUL %0, %1
+ %8:_(s16) = contract G_FADD %7, %2
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul_rhs
body: |
bb.1.entry:
@@ -575,19 +688,7 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -602,19 +703,7 @@ body: |
; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_half_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -629,19 +718,7 @@ body: |
; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -656,19 +733,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -683,6 +747,80 @@ body: |
...
---
+name: test_half_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = contract G_FMUL %0, %1
+ %8:_(s16) = contract G_FADD %2, %7
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_double_add_mul
body: |
bb.1.entry:
@@ -706,23 +844,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_double_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -741,23 +863,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_double_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -776,23 +882,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_double_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -811,23 +901,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -846,6 +919,101 @@ body: |
...
---
+name: test_double_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = contract G_FMUL %0, %1
+ %11:_(s64) = contract G_FADD %10, %2
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+
+---
name: test_double_add_mul_rhs
body: |
bb.1.entry:
@@ -869,23 +1037,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -904,23 +1056,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_double_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -939,23 +1075,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -974,23 +1094,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -1009,6 +1112,100 @@ body: |
...
---
+name: test_double_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = contract G_FMUL %0, %1
+ %11:_(s64) = contract G_FADD %2, %10
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xfloat_add_mul
body: |
bb.1.entry:
@@ -1040,32 +1237,7 @@ body: |
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX9-DENORM-NEXT: {{ $}}
@@ -1092,32 +1264,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10-LABEL: name: test_4xfloat_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX10-NEXT: {{ $}}
@@ -1144,32 +1291,7 @@ body: |
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX10-DENORM-NEXT: {{ $}}
@@ -1196,32 +1318,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1248,6 +1344,144 @@ body: |
...
---
+name: test_4xfloat_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+
+ ; GFX9-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+ %16:_(<4 x s32>) = contract G_FMUL %0, %1
+ %17:_(<4 x s32>) = contract G_FADD %16, %2
+ %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
+ $vgpr0 = COPY %19(s32)
+ $vgpr1 = COPY %20(s32)
+ $vgpr2 = COPY %21(s32)
+ $vgpr3 = COPY %22(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+...
+
+---
name: test_3xfloat_add_mul_rhs
body: |
bb.1.entry:
@@ -1275,28 +1509,7 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX9-DENORM-NEXT: {{ $}}
@@ -1319,28 +1532,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1363,28 +1555,7 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-DENORM-NEXT: {{ $}}
@@ -1407,28 +1578,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1451,6 +1600,124 @@ body: |
...
---
+name: test_3xfloat_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+
+ ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32)
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+ %13:_(<3 x s32>) = contract G_FMUL %0, %1
+ %14:_(<3 x s32>) = contract G_FADD %2, %13
+ %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
+ $vgpr0 = COPY %16(s32)
+ $vgpr1 = COPY %17(s32)
+ $vgpr2 = COPY %18(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+...
+
+---
name: test_4xhalf_add_mul
body: |
bb.1.entry:
@@ -1474,24 +1741,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1510,24 +1760,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_4xhalf_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1546,24 +1779,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1582,24 +1798,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1618,6 +1816,105 @@ body: |
...
---
+name: test_4xhalf_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+ %10:_(<4 x s16>) = contract G_FMUL %0, %1
+ %11:_(<4 x s16>) = contract G_FADD %10, %2
+ %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
+ $vgpr0 = COPY %13(<2 x s16>)
+ $vgpr1 = COPY %14(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+
+---
name: test_3xhalf_add_mul_rhs
body: |
bb.1.entry:
@@ -1648,31 +1945,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1698,31 +1970,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1748,31 +1995,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1797,31 +2019,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1846,6 +2043,134 @@ body: |
...
---
+name: test_3xhalf_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %10:_(<2 x s16>) = G_IMPLICIT_DEF
+ %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>)
+ %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>)
+ %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
+ %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+ %17:_(<3 x s16>) = contract G_FMUL %0, %1
+ %18:_(<3 x s16>) = contract G_FADD %2, %17
+ %22:_(<3 x s16>) = G_IMPLICIT_DEF
+ %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>)
+ %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
+ $vgpr0 = COPY %20(<2 x s16>)
+ $vgpr1 = COPY %21(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xdouble_add_mul
body: |
bb.1.entry:
@@ -1905,60 +2230,7 @@ body: |
; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX9-DENORM-NEXT: {{ $}}
@@ -2013,60 +2285,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX10-LABEL: name: test_4xdouble_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-NEXT: {{ $}}
@@ -2121,60 +2340,7 @@ body: |
; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-DENORM-NEXT: {{ $}}
@@ -2229,60 +2395,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2337,6 +2449,284 @@ body: |
...
---
+name: test_4xdouble_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX9-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64)
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %22:_(s32) = COPY $vgpr18
+ %23:_(s32) = COPY $vgpr19
+ %24:_(s32) = COPY $vgpr20
+ %25:_(s32) = COPY $vgpr21
+ %26:_(s32) = COPY $vgpr22
+ %27:_(s32) = COPY $vgpr23
+ %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32)
+ %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
+ %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
+ %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+ %40:_(<4 x s64>) = contract G_FMUL %0, %1
+ %41:_(<4 x s64>) = contract G_FADD %40, %2
+ %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
+ $vgpr0 = COPY %43(s32)
+ $vgpr1 = COPY %44(s32)
+ $vgpr2 = COPY %45(s32)
+ $vgpr3 = COPY %46(s32)
+ $vgpr4 = COPY %47(s32)
+ $vgpr5 = COPY %48(s32)
+ $vgpr6 = COPY %49(s32)
+ $vgpr7 = COPY %50(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+...
+
+---
name: test_3xdouble_add_mul_rhs
body: |
bb.1.entry:
@@ -2385,49 +2775,7 @@ body: |
; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX9-DENORM-NEXT: {{ $}}
@@ -2471,49 +2819,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-NEXT: {{ $}}
@@ -2557,49 +2863,7 @@ body: |
; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-DENORM-NEXT: {{ $}}
@@ -2643,49 +2907,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2727,3 +2948,226 @@ body: |
$vgpr5 = COPY %39(s32)
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
...
+
+---
+name: test_3xdouble_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+
+ ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64)
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+ %31:_(<3 x s64>) = contract G_FMUL %0, %1
+ %32:_(<3 x s64>) = contract G_FADD %2, %31
+ %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
+ $vgpr0 = COPY %34(s32)
+ $vgpr1 = COPY %35(s32)
+ $vgpr2 = COPY %36(s32)
+ $vgpr3 = COPY %37(s32)
+ $vgpr4 = COPY %38(s32)
+ $vgpr5 = COPY %39(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
index 42e53be..8f9fc67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
@@ -1,12 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s
---
name: test_f32_add_mul
@@ -25,16 +21,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -46,16 +32,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_f32_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -67,16 +43,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -87,16 +53,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -107,6 +63,60 @@ body: |
...
---
+name: test_f32_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = reassoc contract G_FMUL %0, %1
+ %5:_(s32) = reassoc contract G_FADD %4, %2
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_f32_add_mul_rhs
body: |
bb.1.entry:
@@ -123,16 +133,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -144,16 +144,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_f32_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -165,16 +155,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -185,16 +165,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -205,6 +175,60 @@ body: |
...
---
+name: test_f32_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = reassoc contract G_FMUL %0, %1
+ %5:_(s32) = reassoc contract G_FADD %2, %4
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul
body: |
bb.1.entry:
@@ -225,20 +249,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_half_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -254,20 +264,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_half_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -283,20 +279,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_half_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -311,20 +293,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -339,6 +307,81 @@ body: |
...
---
+name: test_half_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = reassoc contract G_FMUL %0, %1
+ %8:_(s16) = reassoc contract G_FADD %7, %2
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+
+---
name: test_half_add_mul_rhs
body: |
bb.1.entry:
@@ -359,20 +402,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -388,20 +417,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_half_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -417,20 +432,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -445,20 +446,84 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = reassoc G_FMUL %0, %1
+ %8:_(s16) = reassoc G_FADD %2, %7
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
+name: test_half_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ; GFX10-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -497,24 +562,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_double_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -534,24 +581,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_double_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -571,24 +600,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_double_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -607,24 +618,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -643,6 +636,100 @@ body: |
...
---
+name: test_double_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = reassoc contract G_FMUL %0, %1
+ %11:_(s64) = reassoc contract G_FADD %10, %2
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_double_add_mul_rhs
body: |
bb.1.entry:
@@ -667,24 +754,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -704,24 +773,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_double_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -741,24 +792,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -777,24 +810,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -813,6 +828,100 @@ body: |
...
---
+name: test_double_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = reassoc contract G_FMUL %0, %1
+ %11:_(s64) = reassoc contract G_FADD %2, %10
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xfloat_add_mul
body: |
bb.1.entry:
@@ -845,32 +954,6 @@ body: |
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX9-DENORM-NEXT: {{ $}}
@@ -898,32 +981,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX10-LABEL: name: test_4xfloat_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX10-NEXT: {{ $}}
@@ -951,32 +1008,6 @@ body: |
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX10-DENORM-NEXT: {{ $}}
@@ -1003,32 +1034,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1055,6 +1060,140 @@ body: |
...
---
+name: test_4xfloat_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+
+ ; GFX9-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+ %16:_(<4 x s32>) = reassoc contract G_FMUL %0, %1
+ %17:_(<4 x s32>) = reassoc contract G_FADD %16, %2
+ %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
+ $vgpr0 = COPY %19(s32)
+ $vgpr1 = COPY %20(s32)
+ $vgpr2 = COPY %21(s32)
+ $vgpr3 = COPY %22(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+...
+
+---
name: test_3xfloat_add_mul_rhs
body: |
bb.1.entry:
@@ -1083,28 +1222,6 @@ body: |
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX9-DENORM-NEXT: {{ $}}
@@ -1128,28 +1245,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1173,28 +1268,6 @@ body: |
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-DENORM-NEXT: {{ $}}
@@ -1217,28 +1290,124 @@ body: |
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32)
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+ %13:_(<3 x s32>) = reassoc G_FMUL %0, %1
+ %14:_(<3 x s32>) = reassoc G_FADD %2, %13
+ %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
+ $vgpr0 = COPY %16(s32)
+ $vgpr1 = COPY %17(s32)
+ $vgpr2 = COPY %18(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+...
+
+---
+name: test_3xfloat_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+
+ ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1285,24 +1454,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1322,24 +1473,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_4xhalf_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1359,24 +1492,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1395,24 +1510,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1431,6 +1528,100 @@ body: |
...
---
+name: test_4xhalf_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+ %10:_(<4 x s16>) = reassoc contract G_FMUL %0, %1
+ %11:_(<4 x s16>) = reassoc contract G_FADD %10, %2
+ %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
+ $vgpr0 = COPY %13(<2 x s16>)
+ $vgpr1 = COPY %14(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_3xhalf_add_mul_rhs
body: |
bb.1.entry:
@@ -1461,30 +1652,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1510,30 +1677,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1559,30 +1702,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1607,30 +1726,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1655,6 +1750,130 @@ body: |
...
---
+name: test_3xhalf_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %10:_(<2 x s16>) = G_IMPLICIT_DEF
+ %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>)
+ %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>)
+ %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
+ %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+ %17:_(<3 x s16>) = reassoc contract G_FMUL %0, %1
+ %18:_(<3 x s16>) = reassoc contract G_FADD %2, %17
+ %22:_(<3 x s16>) = G_IMPLICIT_DEF
+ %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>)
+ %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
+ $vgpr0 = COPY %20(<2 x s16>)
+ $vgpr1 = COPY %21(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xdouble_add_mul
body: |
bb.1.entry:
@@ -1715,60 +1934,6 @@ body: |
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX9-DENORM-NEXT: {{ $}}
@@ -1824,60 +1989,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX10-LABEL: name: test_4xdouble_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-NEXT: {{ $}}
@@ -1933,60 +2044,6 @@ body: |
; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-DENORM-NEXT: {{ $}}
@@ -2041,60 +2098,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2149,6 +2152,280 @@ body: |
...
---
+name: test_4xdouble_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX9-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64)
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %22:_(s32) = COPY $vgpr18
+ %23:_(s32) = COPY $vgpr19
+ %24:_(s32) = COPY $vgpr20
+ %25:_(s32) = COPY $vgpr21
+ %26:_(s32) = COPY $vgpr22
+ %27:_(s32) = COPY $vgpr23
+ %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32)
+ %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
+ %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
+ %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+ %40:_(<4 x s64>) = reassoc contract G_FMUL %0, %1
+ %41:_(<4 x s64>) = reassoc contract G_FADD %40, %2
+ %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
+ $vgpr0 = COPY %43(s32)
+ $vgpr1 = COPY %44(s32)
+ $vgpr2 = COPY %45(s32)
+ $vgpr3 = COPY %46(s32)
+ $vgpr4 = COPY %47(s32)
+ $vgpr5 = COPY %48(s32)
+ $vgpr6 = COPY %49(s32)
+ $vgpr7 = COPY %50(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+...
+
+---
name: test_3xdouble_add_mul_rhs
body: |
bb.1.entry:
@@ -2198,49 +2475,6 @@ body: |
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX9-DENORM-NEXT: {{ $}}
@@ -2285,49 +2519,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-NEXT: {{ $}}
@@ -2372,49 +2563,6 @@ body: |
; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-DENORM-NEXT: {{ $}}
@@ -2458,49 +2606,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2542,3 +2647,222 @@ body: |
$vgpr5 = COPY %39(s32)
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
...
+
+---
+name: test_3xdouble_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+
+ ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64)
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+ %31:_(<3 x s64>) = reassoc contract G_FMUL %0, %1
+ %32:_(<3 x s64>) = reassoc contract G_FADD %2, %31
+ %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
+ $vgpr0 = COPY %34(s32)
+ $vgpr1 = COPY %35(s32)
+ $vgpr2 = COPY %36(s32)
+ $vgpr3 = COPY %37(s32)
+ $vgpr4 = COPY %38(s32)
+ $vgpr5 = COPY %39(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 24dd535..3f6e3d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -2,11 +2,9 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX9-UNSAFE %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX10-UNSAFE %s
define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul:
@@ -28,12 +26,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_f32_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_f32_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +44,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_f32_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -64,6 +55,58 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
ret float %b
}
+define float @test_f32_add_mul_contract(float %x, float %y, float %z) {
+; GFX9-LABEL: test_f32_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_f32_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_f32_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_f32_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_f32_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_f32_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_f32_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_f32_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %b = fadd contract float %a, %z
+ ret float %b
+}
+
define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -84,12 +127,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_f32_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -108,7 +145,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,6 +156,58 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
ret float %b
}
+define float @test_f32_add_mul_rhs_contract(float %x, float %y, float %z) {
+; GFX9-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %b = fadd contract float %z, %a
+ ret float %b
+}
+
define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
; GFX9-LABEL: test_add_mul_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
@@ -147,14 +235,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_add_mul_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +261,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -198,17 +277,16 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
ret float %b
}
-define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
-; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
+define float @test_add_mul_multiple_defs_z_contract(float %x, float %y, ptr addrspace(1) %vec_ptr) {
+; GFX9-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
@@ -216,7 +294,7 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
@@ -225,13 +303,81 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX10-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %vec = load <2 x float>, ptr addrspace(1) %vec_ptr
+ %z = extractelement <2 x float> %vec, i64 1
+ %b = fadd contract float %a, %z
+ ret float %b
+}
+
+define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
+; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
@@ -259,7 +405,6 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -296,12 +441,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_half_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_half_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -321,7 +460,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_half_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -333,6 +471,59 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
ret half %b
}
+define half @test_half_add_mul_contract(half %x, half %y, half %z) {
+; GFX9-LABEL: test_half_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_half_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_half_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_half_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_half_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_half_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_half_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_half_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract half %x, %y
+ %b = fadd contract half %a, %z
+ ret half %b
+}
+
define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX9-LABEL: test_half_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -353,12 +544,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_half_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,7 +563,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -390,6 +574,59 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
ret half %b
}
+define half @test_half_add_mul_rhs_contract(half %x, half %y, half %z) {
+; GFX9-LABEL: test_half_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_half_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract half %x, %y
+ %b = fadd contract half %z, %a
+ ret half %b
+}
+
define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX9-LABEL: test_double_add_mul:
; GFX9: ; %bb.0: ; %.entry
@@ -411,12 +648,6 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_double_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_double_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -436,15 +667,61 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
+
+define double @test_double_add_mul_contract(double %x, double %y, double %z) {
+; GFX9-LABEL: test_double_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_double_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_double_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
+; GFX10-LABEL: test_double_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_double_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_double_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_double_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_double_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul double %x, %y
- %b = fadd double %a, %z
+ %a = fmul contract double %x, %y
+ %b = fadd contract double %a, %z
ret double %b
}
@@ -469,12 +746,6 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_double_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,15 +765,61 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul double %x, %y
+ %b = fadd double %z, %a
+ ret double %b
+}
+
+define double @test_double_add_mul_rhs_contract(double %x, double %y, double %z) {
+; GFX9-LABEL: test_double_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs:
+; GFX10-LABEL: test_double_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_double_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul double %x, %y
- %b = fadd double %z, %a
+ %a = fmul contract double %x, %y
+ %b = fadd contract double %z, %a
ret double %b
}
@@ -538,15 +855,6 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
-; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9
-; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10
-; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xfloat_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -577,8 +885,75 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10
; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <4 x float> %x, %y
+ %b = fadd <4 x float> %a, %z
+ ret <4 x float> %b
+}
+
+define <4 x float> @test_4xfloat_add_mul_contract(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; GFX9-LABEL: test_4xfloat_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v4, v8
+; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v5, v9
+; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10
+; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul:
+; GFX10-LABEL: test_4xfloat_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xfloat_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xfloat_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-DENORM-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
@@ -587,8 +962,8 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX10-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <4 x float> %x, %y
- %b = fadd <4 x float> %a, %z
+ %a = fmul contract <4 x float> %x, %y
+ %b = fadd contract <4 x float> %a, %z
ret <4 x float> %b
}
@@ -620,14 +995,6 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
-; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7
-; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xfloat_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -654,8 +1021,68 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7
; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <3 x float> %x, %y
+ %b = fadd <3 x float> %z, %a
+ ret <3 x float> %b
+}
+
+define <3 x float> @test_3xfloat_add_mul_rhs_contract(<3 x float> %x, <3 x float> %y, <3 x float> %z) {
+; GFX9-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
+; GFX9-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v3, v6
+; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7
+; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
@@ -663,8 +1090,8 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX10-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <3 x float> %x, %y
- %b = fadd <3 x float> %z, %a
+ %a = fmul contract <3 x float> %x, %y
+ %b = fadd contract <3 x float> %z, %a
ret <3 x float> %b
}
@@ -694,13 +1121,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xhalf_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -725,7 +1145,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -738,6 +1157,70 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
ret <4 x half> %b
}
+define <4 x half> @test_4xhalf_add_mul_contract(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
+; GFX9-LABEL: test_4xhalf_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_4xhalf_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <4 x half> %x, %y
+ %b = fadd contract <4 x half> %a, %z
+ ret <4 x half> %b
+}
+
define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
; GFX9-LABEL: test_3xhalf_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -764,13 +1247,6 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xhalf_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -795,16 +1271,73 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <3 x half> %x, %y
+ %b = fadd <3 x half> %z, %a
+ ret <3 x half> %b
+}
+
+define <3 x half> @test_3xhalf_add_mul_rhs_contract(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
+; GFX9-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
+; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <3 x half> %x, %y
- %b = fadd <3 x half> %z, %a
+ %a = fmul contract <3 x half> %x, %y
+ %b = fadd contract <3 x half> %z, %a
ret <3 x half> %b
}
@@ -844,15 +1377,6 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xdouble_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -887,7 +1411,14 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], v[20:21]
; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_4xdouble_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -902,6 +1433,66 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
ret <4 x double> %b
}
+define <4 x double> @test_4xdouble_add_mul_contract(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
+; GFX9-LABEL: test_4xdouble_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xdouble_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xdouble_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_4xdouble_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xdouble_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xdouble_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <4 x double> %x, %y
+ %b = fadd contract <4 x double> %a, %z
+ ret <4 x double> %b
+}
+
define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
; GFX9-LABEL: test_3xdouble_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -933,14 +1524,6 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xdouble_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -970,7 +1553,13 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[14:15], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -983,3 +1572,57 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
%b = fadd <3 x double> %z, %a
ret <3 x double> %b
}
+
+define <3 x double> @test_3xdouble_add_mul_rhs_contract(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
+; GFX9-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <3 x double> %x, %y
+ %b = fadd contract <3 x double> %z, %a
+ ret <3 x double> %b
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index 2845a63..d9ac9a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -24,8 +24,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FADD %6, %el1
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FADD %6, %el1
$vgpr0 = COPY %7(s32)
...
@@ -54,8 +54,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FADD %el1, %6
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FADD %el1, %6
$vgpr0 = COPY %7(s32)
...
@@ -233,10 +233,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
- %10:_(s16) = G_FMUL %7, %9
+ %10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %0, %1, %11
- %13:_(s32) = G_FADD %12, %el1
+ %13:_(s32) = contract G_FADD %12, %el1
$vgpr0 = COPY %13(s32)
...
@@ -282,11 +282,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
- %12:_(s16) = G_FMUL %9, %11
- %13:_(s16) = G_FMUL %1, %3
- %14:_(s16) = G_FADD %13, %12
+ %12:_(s16) = contract G_FMUL %9, %11
+ %13:_(s16) = contract G_FMUL %1, %3
+ %14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
- %16:_(s32) = G_FADD %15, %el1
+ %16:_(s32) = contract G_FADD %15, %el1
$vgpr0 = COPY %16(s32)
...
@@ -326,10 +326,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
- %10:_(s16) = G_FMUL %7, %9
+ %10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %4, %5, %11
- %13:_(s32) = G_FADD %el1, %12
+ %13:_(s32) = contract G_FADD %el1, %12
$vgpr0 = COPY %13(s32)
...
@@ -375,11 +375,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
- %12:_(s16) = G_FMUL %9, %11
- %13:_(s16) = G_FMUL %5, %7
- %14:_(s16) = G_FADD %13, %12
+ %12:_(s16) = contract G_FMUL %9, %11
+ %13:_(s16) = contract G_FMUL %5, %7
+ %14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
- %16:_(s32) = G_FADD %el1, %15
+ %16:_(s32) = contract G_FADD %el1, %15
$vgpr0 = COPY %16(s32)
...
@@ -409,8 +409,8 @@ body: |
%ptr:_(p1) = COPY $vgpr0_vgpr1
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FSUB %6, %el1
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FSUB %6, %el1
$vgpr0 = COPY %7(s32)
...
@@ -440,7 +440,7 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FSUB %el1, %6
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FSUB %el1, %6
$vgpr0 = COPY %7(s32)
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index c4d57ac..da25ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX11-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
ret void
@@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
index c82ae2fb..bf36979 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -13,7 +13,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -30,7 +30,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0)
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 8192d4a..0e132f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -85,12 +85,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -153,12 +153,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -168,12 +168,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -236,12 +236,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -319,12 +319,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -334,12 +334,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -402,12 +402,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -417,12 +417,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -485,12 +485,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -500,12 +500,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -568,12 +568,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -583,12 +583,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -651,12 +651,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -666,12 +666,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -734,12 +734,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -749,12 +749,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -817,12 +817,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -832,12 +832,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -900,12 +900,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -983,12 +983,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -998,12 +998,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
index f513de8..477ef32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
@@ -385,117 +385,16 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[C2]]
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32)
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]]
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]]
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]]
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]]
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]]
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744
- ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]]
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ADD]], [[C9]](s32)
- ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]]
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD]]
- ; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]]
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
- ; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]]
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096
- ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32)
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]]
- ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1)
- ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]]
- ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[C10]]
- ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]]
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]]
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32)
- ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]]
- ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1)
- ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
- ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]]
- ; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1)
- ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]]
- ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]]
- ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
- ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD]](s32), [[C17]]
- ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]]
- ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039
- ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C18]]
- ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]]
- ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32)
- ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]]
- ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]]
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C1]]
- ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[C2]]
- ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32)
- ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]]
- ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]]
- ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]]
- ; CHECK-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]]
- ; CHECK-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1)
- ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]]
- ; CHECK-NEXT: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]]
- ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]]
- ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]]
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ADD2]], [[C9]](s32)
- ; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]]
- ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD2]]
- ; CHECK-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]]
- ; CHECK-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]]
- ; CHECK-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]]
- ; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32)
- ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32)
- ; CHECK-NEXT: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]]
- ; CHECK-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1)
- ; CHECK-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]]
- ; CHECK-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD2]](s32), [[C10]]
- ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]]
- ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]]
- ; CHECK-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32)
- ; CHECK-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]]
- ; CHECK-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1)
- ; CHECK-NEXT: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]]
- ; CHECK-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1)
- ; CHECK-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]]
- ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]]
- ; CHECK-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD2]](s32), [[C17]]
- ; CHECK-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]]
- ; CHECK-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD2]](s32), [[C18]]
- ; CHECK-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]]
- ; CHECK-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32)
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]]
- ; CHECK-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]]
- ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[OR7]], [[C21]]
- ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[OR15]], [[C21]]
- ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32)
- ; CHECK-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV]](s64)
+ ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC]](s32)
+ ; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV1]](s64)
+ ; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC2]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC3]](s16)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s16>) = afn G_FPTRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
index 696fd57..18ec3ab 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll
@@ -14,8 +14,7 @@ define internal fastcc void @foo(ptr %kg) {
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
; CHECK-NEXT: br label %[[WHILE_COND:.*]]
; CHECK: [[WHILE_COND]]:
-; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[KG]] to ptr addrspace(5)
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4, !noalias.addrspace [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[KG]], align 4
; CHECK-NEXT: [[IDXPROM_I:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: switch i32 0, label %[[SW_BB92:.*]] [
; CHECK-NEXT: i32 1, label %[[SW_BB92]]
@@ -23,22 +22,18 @@ define internal fastcc void @foo(ptr %kg) {
; CHECK-NEXT: ]
; CHECK: [[SUBD_TRIANGLE_PATCH_EXIT_I_I35]]:
; CHECK-NEXT: [[ARRAYIDX_I27_I:%.*]] = getelementptr float, ptr [[KG]], i64 [[IDXPROM_I]]
-; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[ARRAYIDX_I27_I]] to ptr addrspace(5)
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP2]], align 4, !noalias.addrspace [[META0]]
+; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX_I27_I]], align 4
; CHECK-NEXT: br label %[[WHILE_COND]]
; CHECK: [[SW_BB92]]:
; CHECK-NEXT: [[INSERT:%.*]] = insertelement <3 x i32> zeroinitializer, i32 [[TMP1]], i64 0
; CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = bitcast <3 x i32> [[INSERT]] to <3 x float>
; CHECK-NEXT: [[SHFL:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT_I]], <3 x float> zeroinitializer, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5)
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4, !noalias.addrspace [[META0]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[NUM_CLOSURE_I26_I]], align 4
; CHECK-NEXT: [[IDXPROM_I27_I:%.*]] = sext i32 [[LOAD]] to i64
; CHECK-NEXT: [[ARRAYIDX_I28_I:%.*]] = getelementptr [64 x %struct.ShaderClosure], ptr [[CLOSURE_I25_I]], i64 0, i64 [[IDXPROM_I27_I]]
-; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX_I28_I]] to ptr addrspace(5)
-; CHECK-NEXT: store <4 x float> [[SHFL]], ptr addrspace(5) [[TMP4]], align 16, !noalias.addrspace [[META0]]
+; CHECK-NEXT: store <4 x float> [[SHFL]], ptr [[ARRAYIDX_I28_I]], align 16
; CHECK-NEXT: [[INC_I30_I:%.*]] = or i32 [[LOAD]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5)
-; CHECK-NEXT: store i32 [[INC_I30_I]], ptr addrspace(5) [[TMP5]], align 4, !noalias.addrspace [[META0]]
+; CHECK-NEXT: store i32 [[INC_I30_I]], ptr [[NUM_CLOSURE_I26_I]], align 4
; CHECK-NEXT: br label %[[WHILE_COND]]
;
entry:
@@ -93,6 +88,3 @@ entry:
}
attributes #0 = { norecurse }
-;.
-; CHECK: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
index 1a457c9..9241a23 100644
--- a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
+++ b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
@@ -38,20 +38,20 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 1065353216, implicit $exec
- ; CHECK-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $vgpr6 = V_MOV_B32_e32 1073741824, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1073741824, implicit $exec
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr0 = COPY renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr4, killed $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr5 = COPY renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr8 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr7 = COPY killed renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr5_vgpr6_vgpr7_vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr4, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 1adf542..9979e83 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
ret void
}
+define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
+; GCN-LABEL: test_clamp_bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
+; GCN-LABEL: test_clamp_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
+; GCN-LABEL: test_clamp_v2bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_exp_bf16_e32 v0, v0
+; GCN-NEXT: v_nop
+; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %exp = call bfloat @llvm.exp2.bf16(bfloat %src)
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
+; GCN-LABEL: test_clamp_v2bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %mul = fmul <2 x bfloat> %src0, %src1
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vll:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_fma_v2bf16_vvv:
; GCN: ; %bb.0:
@@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7859fcdf..52e697c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -468,15 +468,28 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v9, v1
+; GFX900-NEXT: v_mov_b32_e32 v8, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_mov_b32_e32 v0, v8
+; GFX950-NEXT: v_mov_b32_e32 v1, v9
+; GFX950-NEXT: v_mov_b32_e32 v2, v10
+; GFX950-NEXT: v_mov_b32_e32 v3, v11
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v16bf16:
; GFX10: ; %bb.0:
@@ -619,17 +632,32 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v17, v1
+; GFX900-NEXT: v_mov_b32_e32 v16, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: s_waitcnt vmcnt(3)
+; GFX950-NEXT: v_mov_b32_e32 v0, v16
+; GFX950-NEXT: v_mov_b32_e32 v1, v17
+; GFX950-NEXT: v_mov_b32_e32 v2, v18
+; GFX950-NEXT: v_mov_b32_e32 v3, v19
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v32bf16:
; GFX10: ; %bb.0:
@@ -877,22 +905,41 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, v1
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v64bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v29, v1
+; GFX900-NEXT: v_mov_b32_e32 v28, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v64bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[32:35], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:64
+; GFX950-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:80
+; GFX950-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:96
+; GFX950-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:112
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_mov_b32_e32 v0, v32
+; GFX950-NEXT: v_mov_b32_e32 v1, v33
+; GFX950-NEXT: v_mov_b32_e32 v2, v34
+; GFX950-NEXT: v_mov_b32_e32 v3, v35
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v64bf16:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eae..763f436 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -51,11 +51,11 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector2:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -116,13 +116,13 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector4:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
-; GFX942-NEXT: v_mov_b32_e32 v2, 7
-; GFX942-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
+; GFX942-NEXT: v_mov_b32_e32 v4, 7
+; GFX942-NEXT: v_mov_b32_e32 v5, 8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -307,13 +307,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX942-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s3, s3, 16
; GFX942-NEXT: s_lshl_b32 s2, s2, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 20f48da..c126f9e 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -15,7 +15,7 @@ define internal void @direct() {
; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
; CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; CHECK-NEXT: call void @indirect()
+; CHECK-NEXT: call void [[FP]]()
; CHECK-NEXT: ret void
;
%fptr = alloca ptr, addrspace(5)
@@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
}
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index fa25f09..d646460 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; ATTRIBUTOR_GCN-NEXT: call void @indirect()
+; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
; ATTRIBUTOR_GCN-NEXT: ret void
;
%fptr = alloca ptr, addrspace(5)
@@ -29,5 +29,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 8781196..4f752d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -8,10 +8,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-LABEL: global_load_lds_dword_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
+; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
; GFX942-NEXT: s_getpc_b64 s[0:1]
; GFX942-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
; GFX942-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
@@ -21,9 +21,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_mul_i32 s3, s3, 10
; GFX942-NEXT: s_mul_i32 s2, s2, 10
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-LABEL: global_load_lds_dword_saddr:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fcee..6067194 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -86,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -154,12 +154,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -169,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -237,12 +237,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -252,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -320,12 +320,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -335,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -403,12 +403,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -418,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -486,12 +486,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -501,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -584,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -652,12 +652,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -735,12 +735,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -750,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -818,12 +818,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -833,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -901,12 +901,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -984,12 +984,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -999,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index d0b41e1..57b4857 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f32_to_f16(
; SI-SDAG-LABEL: fptrunc_f32_to_f16:
@@ -201,8 +201,8 @@ entry:
ret void
}
-define amdgpu_kernel void @fptrunc_f64_to_f16(
-; SI-SDAG-LABEL: fptrunc_f64_to_f16:
+define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
+; SI-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; SI-SDAG: ; %bb.0: ; %entry
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -212,29 +212,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s8, s2
; SI-SDAG-NEXT: s_mov_b32 s9, s3
-; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-SDAG-NEXT: s_mov_b32 s4, s0
; SI-SDAG-NEXT: s_mov_b32 s5, s1
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-SDAG-NEXT: s_endpgm
;
-; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; SI-GISEL: ; %bb.0: ; %entry
; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3
; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
-; VI-SDAG-LABEL: fptrunc_f64_to_f16:
+; VI-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; VI-SDAG: ; %bb.0: ; %entry
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -244,29 +242,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s8, s2
; VI-SDAG-NEXT: s_mov_b32 s9, s3
-; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-SDAG-NEXT: s_mov_b32 s4, s0
; VI-SDAG-NEXT: s_mov_b32 s5, s1
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-SDAG-NEXT: s_endpgm
;
-; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; VI-GISEL: ; %bb.0: ; %entry
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -276,29 +272,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -308,23 +302,541 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_mov_b32 s8, s2
; GFX950-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX950-SDAG-NEXT: s_mov_b32 s4, s0
; GFX950-SDAG-NEXT: s_mov_b32 s5, s1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX950-SDAG-NEXT: s_endpgm
;
+; GFX950-GISEL-LABEL: fptrunc_f32_to_f16_afn:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load float, ptr addrspace(1) %a
+ %r.val = fptrunc afn float %a.val to half
+ store half %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16(
+; SI-SDAG-LABEL: fptrunc_f64_to_f16:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s6, s6, s7
+; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s6, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; SI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s6, s1, s0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s6, s5, 8
+; SI-GISEL-NEXT: s_and_b32 s7, s5, 0x1ff
+; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
+; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
+; SI-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; SI-GISEL-NEXT: s_or_b32 s9, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s7
+; SI-GISEL-NEXT: s_max_i32 s7, s8, 0
+; SI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; SI-GISEL-NEXT: s_lshr_b32 s8, s9, s7
+; SI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_cmp_lg_u32 s7, s9
+; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_and_b32 s7, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s7, 3
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s7, 5
+; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s7
+; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s3, s6, s4
+; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s3
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f16:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1
+; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; VI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; VI-SDAG-NEXT: s_or_b32 s9, s5, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; VI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0
+; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
+; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
+; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
+; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
+; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX9-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; GFX9-GISEL-NEXT: s_max_i32 s7, s7, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s2, s6
+; GFX9-GISEL-NEXT: s_min_i32 s7, s7, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s2, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s2, s8, s2
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; GFX9-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX9-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
; GFX950-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX950-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; GFX950-GISEL-NEXT: s_max_i32 s7, s7, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s2, s6
+; GFX950-GISEL-NEXT: s_min_i32 s7, s7, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s2, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s2, s8, s2
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; GFX950-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX950-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -340,13 +852,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
@@ -360,13 +919,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
@@ -376,6 +982,555 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load double, ptr addrspace(1) %a
+ %r.val = fptrunc double %a.val to half
+ store half %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
+; SI-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s6, s6, s7
+; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s6, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; SI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s6, s1, s0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1
+; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; VI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; VI-SDAG-NEXT: s_or_b32 s9, s5, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; VI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0
+; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
@@ -384,7 +1539,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -401,7 +1556,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
ptr addrspace(1) %a) {
entry:
%a.val = load double, ptr addrspace(1) %a
- %r.val = fptrunc double %a.val to half
+ %r.val = fptrunc afn double %a.val to half
store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -626,25 +1781,106 @@ entry:
define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; SI-SDAG: ; %bb.0: ; %entry
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: s_mov_b32 s10, s6
-; SI-SDAG-NEXT: s_mov_b32 s11, s7
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s8, s2
-; SI-SDAG-NEXT: s_mov_b32 s9, s3
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; SI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; SI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; SI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; SI-SDAG-NEXT: s_or_b32 s1, s1, s7
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8
+; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s0, s6, s0
+; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; SI-SDAG-NEXT: s_or_b32 s6, s0, s1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
@@ -654,6 +1890,1251 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s8, s5, 8
+; SI-GISEL-NEXT: s_and_b32 s9, s5, 0x1ff
+; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
+; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
+; SI-GISEL-NEXT: s_sub_i32 s10, 1, s3
+; SI-GISEL-NEXT: s_or_b32 s11, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s8, s8, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s9
+; SI-GISEL-NEXT: s_max_i32 s9, s10, 0
+; SI-GISEL-NEXT: s_min_i32 s9, s9, 13
+; SI-GISEL-NEXT: s_lshr_b32 s10, s11, s9
+; SI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_cmp_lg_u32 s9, s11
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_and_b32 s9, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s9, 3
+; SI-GISEL-NEXT: s_cselect_b32 s10, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s9, 5
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s9
+; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s3, s8, s4
+; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
+; SI-GISEL-NEXT: s_bfe_u32 s5, s7, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s8, s7, 8
+; SI-GISEL-NEXT: s_and_b32 s9, s7, 0x1ff
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
+; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
+; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
+; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
+; SI-GISEL-NEXT: s_sub_i32 s9, 1, s5
+; SI-GISEL-NEXT: s_or_b32 s10, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s8
+; SI-GISEL-NEXT: s_max_i32 s8, s9, 0
+; SI-GISEL-NEXT: s_min_i32 s8, s8, 13
+; SI-GISEL-NEXT: s_lshr_b32 s9, s10, s8
+; SI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_cmp_lg_u32 s8, s10
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_cmp_lt_i32 s5, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_and_b32 s8, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s8
+; SI-GISEL-NEXT: s_cmp_gt_i32 s5, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_lshr_b32 s5, s7, 16
+; SI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
+; SI-GISEL-NEXT: s_and_b32 s5, s5, 0x8000
+; SI-GISEL-NEXT: s_or_b32 s4, s5, s4
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; SI-GISEL-NEXT: s_or_b32 s4, s3, s4
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff
+; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; VI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8
+; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; VI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8
+; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s6
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; VI-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; VI-GISEL-NEXT: s_max_i32 s9, s9, 0
+; VI-GISEL-NEXT: s_or_b32 s8, s3, s8
+; VI-GISEL-NEXT: s_min_i32 s9, s9, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s3, 12
+; VI-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; VI-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; VI-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; VI-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s3, s10, s3
+; VI-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; VI-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; VI-GISEL-NEXT: s_and_b32 s8, s3, 7
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; VI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; VI-GISEL-NEXT: s_add_i32 s3, s3, s8
+; VI-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; VI-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
+; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; VI-GISEL-NEXT: s_max_i32 s8, s8, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s4, s6
+; VI-GISEL-NEXT: s_min_i32 s8, s8, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s4, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; VI-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; VI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; VI-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s4, 7
+; VI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s8, s6
+; VI-GISEL-NEXT: s_add_i32 s4, s4, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; VI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; VI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s3, s4, s3
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_or_b32 s2, s2, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; GFX9-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; GFX9-GISEL-NEXT: s_max_i32 s9, s9, 0
+; GFX9-GISEL-NEXT: s_or_b32 s8, s3, s8
+; GFX9-GISEL-NEXT: s_min_i32 s9, s9, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s3, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s3, s10, s3
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; GFX9-GISEL-NEXT: s_and_b32 s8, s3, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s8, s9, s8
+; GFX9-GISEL-NEXT: s_add_i32 s3, s3, s8
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; GFX9-GISEL-NEXT: s_max_i32 s8, s8, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s4, s6
+; GFX9-GISEL-NEXT: s_min_i32 s8, s8, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s4, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s4, s9, s4
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; GFX9-GISEL-NEXT: s_and_b32 s6, s4, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX9-GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s3, s4, s3
+; GFX9-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; GFX950-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; GFX950-GISEL-NEXT: s_max_i32 s9, s9, 0
+; GFX950-GISEL-NEXT: s_or_b32 s8, s3, s8
+; GFX950-GISEL-NEXT: s_min_i32 s9, s9, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s3, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; GFX950-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s3, s10, s3
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; GFX950-GISEL-NEXT: s_and_b32 s8, s3, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s8, s9, s8
+; GFX950-GISEL-NEXT: s_add_i32 s3, s3, s8
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; GFX950-GISEL-NEXT: s_max_i32 s8, s8, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s4, s6
+; GFX950-GISEL-NEXT: s_min_i32 s8, s8, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s4, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s4, s9, s4
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; GFX950-GISEL-NEXT: s_and_b32 s6, s4, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX950-GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s3, s4, s3
+; GFX950-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s8, s8, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s9, s2, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s8, s8, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s4, s4, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s11, s10, s8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s9
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s11, s8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s8, s10
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s11, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s3, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s2, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s4, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s10, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s8
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s10, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s9
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s10, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s6, s3
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s8, s8, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s9, s2, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s8, s8, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s4, s4, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s11, s10, s8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s9
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s11, s8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s8, s10
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s11, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s3, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s2, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s4, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s10, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s8
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s10, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s9
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s10, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s6, s3
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load <2 x double>, ptr addrspace(1) %a
+ %r.val = fptrunc <2 x double> %a.val to <2 x half>
+ store <2 x half> %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
+; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; SI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; SI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; SI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; SI-SDAG-NEXT: s_or_b32 s1, s1, s7
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8
+; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s0, s6, s0
+; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; SI-SDAG-NEXT: s_or_b32 s6, s0, s1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -664,29 +3145,111 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
-; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff
+; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; VI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8
+; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; VI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8
+; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s6
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
-; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; VI-GISEL: ; %bb.0: ; %entry
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -702,29 +3265,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -740,27 +3383,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX950-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX950-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX950-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX950-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX950-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX950-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2
-; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-SDAG-NEXT: s_endpgm
;
-; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -776,7 +3501,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
@@ -786,21 +3511,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
@@ -810,21 +3627,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
-; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -842,7 +3751,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -863,7 +3772,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
ptr addrspace(1) %a) {
entry:
%a.val = load <2 x double>, ptr addrspace(1) %a
- %r.val = fptrunc <2 x double> %a.val to <2 x half>
+ %r.val = fptrunc afn <2 x double> %a.val to <2 x half>
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 2bd3659..4f8eab1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -3,17 +3,15 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-GISEL,VI-UNSAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f32:
@@ -94,6 +92,85 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
ret void
}
+define amdgpu_kernel void @fptrunc_f64_to_f32_afn(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fptrunc_f64_to_f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-SDAG-NEXT: s_mov_b32 s4, s0
+; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn double %in to float
+ store float %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f16:
; SI: ; %bb.0:
@@ -203,56 +280,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SAFE-SDAG-NEXT: s_endpgm
;
-; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; VI-SAFE-GISEL: ; %bb.0:
-; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
-; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
-; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
-; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
-; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-SAFE-GISEL-NEXT: s_endpgm
+; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
+; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
+; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
+; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
+; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
;
; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-SDAG: ; %bb.0:
@@ -265,17 +342,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-UNSAFE-SDAG-NEXT: s_endpgm
;
-; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; VI-UNSAFE-GISEL: ; %bb.0:
-; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-UNSAFE-GISEL-NEXT: s_endpgm
-;
; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-SDAG: ; %bb.0:
; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -328,56 +394,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
;
-; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX10-SAFE-GISEL: ; %bb.0:
-; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX10-SAFE-GISEL-NEXT: s_endpgm
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
+; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX10-GISEL-NEXT: s_max_i32 s6, s6, 0
+; GFX10-GISEL-NEXT: s_lshl_b32 s7, s4, 12
+; GFX10-GISEL-NEXT: s_min_i32 s6, s6, 13
+; GFX10-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX10-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX10-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX10-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s6, s8
+; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s6, s9, s6
+; GFX10-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX10-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX10-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX10-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX10-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX10-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
;
; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-SDAG: ; %bb.0:
@@ -390,17 +456,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
;
-; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX10-UNSAFE-GISEL: ; %bb.0:
-; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX10-UNSAFE-GISEL-NEXT: s_endpgm
-;
; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-SDAG: ; %bb.0:
; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -461,62 +516,368 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-SDAG-NEXT: s_endpgm
;
-; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0:
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0:
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm
+ %result = fptrunc double %in to half
+ %result_i16 = bitcast half %result to i16
+ store i16 %result_i16, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fptrunc_f64_to_f16_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_movk_i32 s2, 0x7e00
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s0, s7, 8
+; SI-NEXT: s_and_b32 s1, s7, 0x1ff
+; SI-NEXT: s_and_b32 s8, s0, 0xffe
+; SI-NEXT: s_or_b32 s0, s1, s6
+; SI-NEXT: s_cmp_lg_u32 s0, 0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; SI-NEXT: v_readfirstlane_b32 s1, v0
+; SI-NEXT: s_sub_i32 s6, 0x3f1, s0
+; SI-NEXT: s_or_b32 s1, s8, s1
+; SI-NEXT: v_med3_i32 v0, s6, 0, 13
+; SI-NEXT: s_or_b32 s6, s1, 0x1000
+; SI-NEXT: v_readfirstlane_b32 s8, v0
+; SI-NEXT: s_lshr_b32 s9, s6, s8
+; SI-NEXT: s_lshl_b32 s8, s9, s8
+; SI-NEXT: s_cmp_lg_u32 s8, s6
+; SI-NEXT: s_cselect_b32 s6, 1, 0
+; SI-NEXT: s_addk_i32 s0, 0xfc10
+; SI-NEXT: s_or_b32 s6, s9, s6
+; SI-NEXT: s_lshl_b32 s8, s0, 12
+; SI-NEXT: s_or_b32 s8, s1, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 1
+; SI-NEXT: s_cselect_b32 s6, s6, s8
+; SI-NEXT: s_and_b32 s8, s6, 7
+; SI-NEXT: s_cmp_gt_i32 s8, 5
+; SI-NEXT: s_cselect_b32 s9, 1, 0
+; SI-NEXT: s_cmp_eq_u32 s8, 3
+; SI-NEXT: s_cselect_b32 s8, 1, 0
+; SI-NEXT: s_lshr_b32 s6, s6, 2
+; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_add_i32 s6, s6, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 31
+; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; SI-NEXT: s_cmp_lg_u32 s1, 0
+; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00
+; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; SI-NEXT: s_cselect_b32 s0, s1, s6
+; SI-NEXT: s_lshr_b32 s1, s7, 16
+; SI-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-NEXT: s_or_b32 s6, s1, s0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-SAFE-SDAG: ; %bb.0:
+; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
+; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6
+; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7
+; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16
+; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-UNSAFE-SDAG: ; %bb.0:
+; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-SDAG-NEXT: s_endpgm
+;
+; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-SAFE-SDAG: ; %bb.0:
+; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
+; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-UNSAFE-SDAG: ; %bb.0:
+; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
+;
+; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SAFE-SDAG: ; %bb.0:
+; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-SDAG-NEXT: s_endpgm
+;
+; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-SAFE-GISEL: ; %bb.0:
; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-GISEL-NEXT: s_endpgm
;
-; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0:
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -528,7 +889,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0:
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -540,7 +901,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-GISEL-TRUE16: ; %bb.0:
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -552,7 +913,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-GISEL-FAKE16: ; %bb.0:
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -563,7 +924,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_endpgm
- %result = fptrunc double %in to half
+ %result = fptrunc afn double %in to half
%result_i16 = bitcast half %result to i16
store i16 %result_i16, ptr addrspace(1) %out
ret void
@@ -662,6 +1023,99 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32_afn(ptr addrspace(1) %out, <2 x double> %in) {
+; SI-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <2 x double> %in to <2 x float>
+ store <2 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) {
; SI-LABEL: fptrunc_v3f64_to_v3f32:
; SI: ; %bb.0:
@@ -769,6 +1223,113 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v3f64_to_v3f32_afn(ptr addrspace(1) %out, <3 x double> %in) {
+; SI-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x15
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[4:5]
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <3 x double> %in to <3 x float>
+ store <3 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) {
; SI-LABEL: fptrunc_v4f64_to_v4f32:
; SI: ; %bb.0:
@@ -876,6 +1437,113 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32_afn(ptr addrspace(1) %out, <4 x double> %in) {
+; SI-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <4 x double> %in to <4 x float>
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) {
; SI-LABEL: fptrunc_v8f64_to_v8f32:
; SI: ; %bb.0:
@@ -1019,3 +1687,150 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
store <8 x float> %result, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32_afn(ptr addrspace(1) %out, <8 x double> %in) {
+; SI-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; SI-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; SI-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; SI-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc <8 x double> %in to <8 x float>
+ store <8 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-SAFE-GISEL: {{.*}}
+; VI-SAFE-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
new file mode 100644
index 0000000..d13d76f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s
+
+; Check that if -cu-stores is used, we use SCOPE_SE minimum on all stores.
+
+; GCN: flat_store:
+; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel flat_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) {
+entry:
+ store i32 %val, ptr %dst
+ ret void
+}
+
+; GCN: global_store:
+; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
+; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel global_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(1) %dst
+ ret void
+}
+
+; GCN: local_store:
+; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; GCN: .amdhsa_kernel local_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(3) %dst
+ ret void
+}
+
+; GCN: scratch_store:
+; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel scratch_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(5) %dst
+ ret void
+}
+
+; GCN: flat_atomic_store:
+; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel flat_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: global_atomic_store:
+; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
+; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel global_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: local_atomic_store:
+; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; GCN: .amdhsa_kernel local_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: scratch_atomic_store:
+; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel scratch_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
new file mode 100644
index 0000000..d1e82a0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test that stores that may hit scratch are correctly promoted to SCOPE_SE.
+
+define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
+; GCN-LABEL: test_scratch_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr addrspace(5) %ptr
+ ret void
+}
+
+define void @test_unknown_flat_store(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_unknown_flat_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
+; GCN-LABEL: test_flat_store_no_scratch_alloc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+; TODO: handle
+define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_flat_store_noalias_addrspace:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
+ ret void
+}
+
+; TODO: would be nice to handle too
+define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
+; GCN-SDAG-LABEL: test_flat_store_select:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0
+; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: s_wait_dscnt 0x0
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_flat_store_select:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: s_wait_dscnt 0x0
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %a.ascast = addrspacecast ptr addrspace(1) %a to ptr
+ %b.ascast = addrspacecast ptr addrspace(3) %b to ptr
+ %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
index 99690e4..fe8edd5 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
@@ -1,4 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+
+; Make sure flag is ignored
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=1 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index fd644a3..3a898a9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
@@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_wait_xcnt 0x8
; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
@@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xe
@@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
@@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
new file mode 100644
index 0000000..8007597
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+
+---
+name: flat_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: flat_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_prefetch_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: global_prefetch_flat_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 258aa9e..0a493e51 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -8,15 +8,15 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s2, s0
; CHECK-NEXT: s_addc_u32 s1, s3, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
@@ -35,15 +35,15 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index cf15466..c7767cb8 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -16,6 +16,14 @@
ret void
}
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2() #0 {
+ ret void
+ }
+
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
+ ret void
+ }
+
attributes #0 = { "amdgpu-wave-limiter"="true" "amdgpu-waves-per-eu"="8,8" }
...
@@ -311,3 +319,173 @@ body: |
$agpr0 = COPY %0
...
+
+# Non-mac variant, src2 is an immediate.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Non-mac variant, src2 is the same VGPR, but a different subregister.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_1024_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 8718401..b907c13 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -970,3 +970,93 @@ body: |
S_ENDPGM 0
...
+
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $vgpr11 = COPY renamable $vgpr10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_1024_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0.sub0_sub1_sub2_sub3:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub4_sub5_sub6_sub7:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub8_sub9_sub10_sub11:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub12_sub13_sub14_sub15:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
index 11de6c8..06c3da0 100644
--- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -32,32 +32,14 @@
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
-# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
-# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
-# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
-# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1
+# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3
+# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2
+# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = COPY [[SPLIT2]].sub0
# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 1ac75d3..d8c983a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1331,16 +1331,16 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, 0x5040100
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[2:3] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_perm_b32 v1, s6, v1, v9
+; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
new file mode 100644
index 0000000..58e9b0a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-regalloc -greedy-regclass-priority-trumps-globalness=1 -start-after=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+--- |
+ define void @temp_vgpr_to_agpr_should_not_undo_split_with_remat() #0 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "amdgpu-agpr-alloc"="0,0" }
+...
+
+
+---
+name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-LABEL: name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+ ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr3 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr4 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr8 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr9 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr12 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr13 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr15 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr17 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr19 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr21 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr22 = IMPLICIT_DEF
+ ; CHECK-NEXT: KILL killed renamable $vgpr2, killed renamable $vgpr3, killed renamable $vgpr4, killed renamable $vgpr5, killed renamable $vgpr6, killed renamable $vgpr7, killed renamable $vgpr8, killed renamable $vgpr9, killed renamable $vgpr10, killed renamable $vgpr11, killed renamable $vgpr12, killed renamable $vgpr13, killed renamable $vgpr14, killed renamable $vgpr15, killed renamable $vgpr16
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: KILL killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr17, killed renamable $vgpr18, killed renamable $vgpr19, killed renamable $vgpr20, killed renamable $vgpr21, killed renamable $vgpr22
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, implicit killed renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: S_ENDPGM 0
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:vgpr_32 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:vgpr_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ KILL %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17
+ S_NOP 0, implicit-def %50:av_512
+ S_NOP 0, implicit-def %51:av_512
+ KILL %1, %2, %18, %19, %20, %21, %22, %23
+ S_NOP 0, implicit %50, implicit %51
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
new file mode 100644
index 0000000..89555d3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)
+
+define amdgpu_ps void @flat_prefetch(ptr %ptr) {
+; GCN-LABEL: flat_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
+; GCN-LABEL: flat_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %ptr, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr %ptr, i32 %offset
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
+; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
+; GCN-LABEL: flat_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
new file mode 100644
index 0000000..047a6cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col)
+
+define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) {
+; GCN-LABEL: global_prefetch_sgpr:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1]
+; GCN-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GCN-LABEL: global_prefetch_sgpr_voffset_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v0, s[0:1] offset:128
+; GCN-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep2, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 8)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_se_nt:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 9)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_dev_ht:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 18)
+ ret void
+}
+
+define amdgpu_ps void @global_prefetch_sys_lu(ptr addrspace(1) %ptr) {
+; GCN-LABEL: global_prefetch_sys_lu:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_prefetch_b8 v[0:1], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 27)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
index 85dd275..fcdad53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -4,30 +4,30 @@
define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: ds_read_b128 v[0:3], v32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112
-; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96
-; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80
-; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64
-; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16
-; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32
-; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48
+; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112
+; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96
+; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80
+; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64
+; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16
+; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32
+; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b128 v32, v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: ds_write_b128 v0, v[2:5]
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
-; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16
-; GCN-NEXT: ds_write_b64 v32, v[0:1]
+; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112
+; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96
+; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80
+; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64
+; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48
+; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32
+; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index ed7d88b..dcac419 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -18,19 +18,22 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GCN-LABEL: load_1d_lwe:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v9, v8
; GCN-NEXT: v_mov_b32_e32 v10, v8
; GCN-NEXT: v_mov_b32_e32 v11, v8
; GCN-NEXT: v_mov_b32_e32 v12, v8
-; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_mov_b32_e32 v1, v9
-; GCN-NEXT: v_mov_b32_e32 v2, v10
-; GCN-NEXT: v_mov_b32_e32 v3, v11
-; GCN-NEXT: v_mov_b32_e32 v4, v12
-; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe
+; GCN-NEXT: v_mov_b32_e32 v2, v8
+; GCN-NEXT: v_mov_b32_e32 v3, v9
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v8, v4, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: global_store_dword v8, v6, s[8:9]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -75,6 +78,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_cube_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
@@ -106,6 +130,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_2darray_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
new file mode 100644
index 0000000..017d402
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1), i32)
+declare <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1), i32)
+declare <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1), i32)
+declare i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr, i32)
+declare <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr, i32)
+declare <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr, i32)
+
+define amdgpu_ps void @global_load_monitor_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b32_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 1)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b32_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 10)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b64_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 22)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b64_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 27)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b128_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b128 v[4:7], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) %gep, i32 0)
+ store <4 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_monitor_b128_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_monitor_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x i32> @llvm.amdgcn.global.load.monitor.b128.v4i32(ptr addrspace(1) %gep, i32 1)
+ store <4 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @flat_load_monitor_b32(ptr %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: flat_load_monitor_b32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_load_monitor_b32 v0, v[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr addrspace(0) %gep, i32 10)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @flat_load_monitor_b64(ptr %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: flat_load_monitor_b64:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_load_monitor_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr addrspace(0) %gep, i32 22)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @flat_load_monitor_b128(ptr %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: flat_load_monitor_b128:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_load_monitor_b128 v[4:7], v[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %val = call <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr addrspace(0) %gep, i32 27)
+ store <4 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
+; GFX1250-LABEL: global_load_monitor_b32_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
+ %val = call i32 @llvm.amdgcn.global.load.monitor.b32.i32(ptr addrspace(1) %gep, i32 1)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
+; GFX1250-LABEL: global_load_monitor_b64_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i64 %idxprom
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
+; GFX1250-SDAG-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
+ %val = call <2 x i32> @llvm.amdgcn.global.load.monitor.b64.v2i32(ptr addrspace(1) %gep, i32 1)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 1585a2c..303ea50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
@@ -9,50 +10,199 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x2bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_32x32x2bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -62,18 +212,109 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-16: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_16x16x2bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v12, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v13, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v13
+; GFX908-NEXT: v_mov_b32_e32 v13, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v2, s5
+; GFX908-NEXT: v_mov_b32_e32 v13, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s7
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: v_mov_b32_e32 v13, s9
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_mov_b32_e32 v2, s11
+; GFX908-NEXT: v_mov_b32_e32 v13, s12
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s13
+; GFX908-NEXT: v_mov_b32_e32 v2, s14
+; GFX908-NEXT: v_mov_b32_e32 v13, s15
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
+; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; GFX908-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_16x16x2bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -83,18 +324,53 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_4x4x2bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-4: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_4x4x2bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s0
+; GFX908-NEXT: v_mov_b32_e32 v2, s1
+; GFX908-NEXT: v_mov_b32_e32 v3, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 3
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_4x4x2bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x2bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -104,18 +380,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-16: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x4bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v16, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v17, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v17
+; GFX908-NEXT: v_mov_b32_e32 v17, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v2, s5
+; GFX908-NEXT: v_mov_b32_e32 v17, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s7
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: v_mov_b32_e32 v17, s9
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_mov_b32_e32 v2, s11
+; GFX908-NEXT: v_mov_b32_e32 v17, s12
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s13
+; GFX908-NEXT: v_mov_b32_e32 v2, s14
+; GFX908-NEXT: v_mov_b32_e32 v17, s15
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
+; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
+; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
+; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_32x32x4bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -125,18 +493,55 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x8bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-4: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_16x16x8bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s0
+; GFX908-NEXT: v_mov_b32_e32 v2, s1
+; GFX908-NEXT: v_mov_b32_e32 v3, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_16x16x8bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -147,3 +552,5 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 4c26961..ff77d5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,5 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
@@ -10,17 +13,238 @@ declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x doubl
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -30,16 +254,134 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -49,16 +391,82 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
-; GCN-DAG: s_load_dwordx4
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 4
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 4
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -68,16 +476,136 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -87,16 +615,84 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
-; GCN-DAG: s_load_dwordx4
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 6
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -106,13 +702,70 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
-; GFX90A: v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX90A: v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_4x4x4_4b_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX942: v_mfma_f64_4x4x4_4b_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx2
define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_4x4x4f64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_4x4x4f64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], 0
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
+; GFX90A-VGPR-NEXT: s_nop 3
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
%mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
@@ -120,13 +773,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
-; GCN: s_load_dwordx8
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s10
+; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x double>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
@@ -134,14 +884,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_0:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -149,14 +963,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 -1 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -164,14 +1042,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_1:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 1.0{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 1.0{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double 1.0), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -179,14 +1121,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_neg1:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1.0{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1.0{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double -1.0), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -194,14 +1200,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 64 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -209,23 +1279,116 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 0{{$}}
-; GCN: v_accvgpr_write_b32 a[[A_HIGH_BITS_0:[0-9]+]], 64
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
-
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 64
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 64
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -233,23 +1396,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
-
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 64
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 64
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 64
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877907008 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -257,23 +1507,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 1.0
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
-
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (<2 x float> splat (float 1.0) to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -281,26 +1618,236 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm:
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
store <4 x double> %mai.1, ptr addrspace(1) %arg
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
store <4 x double> %mai.1, ptr addrspace(1) %arg
@@ -308,3 +1855,6 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; VGPR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index b792a12..beda16c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1,12 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
@@ -33,17 +30,132 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i3
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[[[TWO]]:[[ONE]]], v[[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -51,17 +163,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[[[TWO]]:[[ONE]]], v[[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_i32_32x32x16i8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_i32_32x32x16i8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_i32_32x32x16i8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_i32_32x32x16i8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -69,17 +318,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -87,17 +451,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -105,17 +584,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -123,17 +717,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -141,17 +850,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -159,17 +1005,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -177,17 +1160,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -195,17 +1315,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -213,15 +1470,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16:
-; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x32_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-AGPRCD: ; %bb.0: ; %bb
+; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-NEXT: s_nop 1
+; GFX942-AGPRCD-NEXT: v_smfmac_f32_16x16x32_f16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-NEXT: s_nop 5
+; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-AGPRCD-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-AGPRCD: ; %bb.0: ; %bb
+; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-NEXT: s_nop 1
+; GFX950-AGPRCD-NEXT: v_smfmac_f32_16x16x32_f16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-NEXT: s_nop 6
+; GFX950-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX950-AGPRCD-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -229,18 +1603,278 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16:
-; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x16_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -248,15 +1882,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16:
-; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x32_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-AGPRCD: ; %bb.0: ; %bb
+; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-NEXT: s_nop 1
+; GFX942-AGPRCD-NEXT: v_smfmac_f32_16x16x32_bf16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-NEXT: s_nop 5
+; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-AGPRCD-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-AGPRCD: ; %bb.0: ; %bb
+; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-NEXT: s_nop 1
+; GFX950-AGPRCD-NEXT: v_smfmac_f32_16x16x32_bf16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-NEXT: s_nop 6
+; GFX950-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX950-AGPRCD-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -264,18 +2015,278 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16:
-; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x16_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -283,15 +2294,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8:
-; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_i32_16x16x64_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
@@ -299,18 +2509,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8:
-; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_i32_32x32x32_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
@@ -318,15 +2820,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -334,15 +3035,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -350,15 +3250,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -366,15 +3465,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -382,18 +3680,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -401,18 +3991,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -420,18 +4302,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -439,18 +4613,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -459,3 +4925,8 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942: {{.*}}
+; GFX942-VGPRCD: {{.*}}
+; GFX950: {{.*}}
+; GFX950-VGPRCD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 452033f..8081a15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,9 +15,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -39,42 +39,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -112,42 +112,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store <16 x float> %result, ptr addrspace(1) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
store <16 x float> %result, ptr addrspace(1) %out
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 9bdae28f..d81ec1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -394,9 +382,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -418,42 +406,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -518,9 +506,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -542,42 +530,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -585,9 +573,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -601,43 +589,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -776,9 +764,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -800,42 +788,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -900,9 +888,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -924,42 +912,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -967,9 +955,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -983,43 +971,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2781,7 +2677,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2791,14 +2687,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2827,7 +2721,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
@@ -2837,14 +2731,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2930,7 +2824,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2940,14 +2834,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2976,7 +2868,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
@@ -2986,14 +2878,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3084,19 +2976,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3112,44 +3004,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3214,19 +3104,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3242,44 +3132,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3287,19 +3175,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3307,45 +3195,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3496,19 +3384,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3524,44 +3412,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3626,19 +3512,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3654,44 +3540,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3699,19 +3583,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3719,45 +3603,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4254,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4326,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4379,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4653,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4725,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4778,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -5053,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5096,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5132,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5287,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5330,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5366,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5651,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5672,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5755,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5776,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5853,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index ccee113..856185b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -1,22 +1,116 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-16: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_i32_32x32x8i8:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v16, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v17, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v17
+; GFX908-NEXT: v_mov_b32_e32 v17, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v2, s5
+; GFX908-NEXT: v_mov_b32_e32 v17, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s7
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: v_mov_b32_e32 v17, s9
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_mov_b32_e32 v2, s11
+; GFX908-NEXT: v_mov_b32_e32 v17, s12
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s13
+; GFX908-NEXT: v_mov_b32_e32 v2, s14
+; GFX908-NEXT: v_mov_b32_e32 v17, s15
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_32x32x8i8:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -24,18 +118,55 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_i32_16x16x16i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-4: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]]
define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_i32_16x16x16i8:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s0
+; GFX908-NEXT: v_mov_b32_e32 v2, s1
+; GFX908-NEXT: v_mov_b32_e32 v3, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_16x16x16i8:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -44,3 +175,5 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index ff305da..78be949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -405,6 +406,63 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -618,6 +676,33 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -719,6 +804,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -934,6 +1036,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1039,6 +1169,24 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1114,19 +1262,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
@@ -1254,19 +1402,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
@@ -1330,7 +1478,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-LABEL: test_mfma_f32_32x32x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1345,8 +1493,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1371,27 +1519,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s2
+; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1406,8 +1554,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1432,22 +1580,83 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v36, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -1676,6 +1885,36 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -1752,46 +1991,66 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
; GFX90A-LABEL: test_mfma_f32_4x4x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2021,6 +2280,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2099,47 +2388,67 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-LABEL: test_mfma_f32_16x16x16f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2508,6 +2817,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_32x32x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2721,6 +3087,33 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2728,6 +3121,143 @@ bb:
ret void
}
+define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; NOLIT-SRCC: ; %bb.0: ; %bb
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 7
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; LIT-SRCC: ; %bb.0: ; %bb
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 7
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+bb:
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> splat (i32 64), i32 1, i32 2, i32 3)
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -2822,6 +3352,23 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2829,6 +3376,197 @@ bb:
ret void
}
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; NOLIT-SRCC: ; %bb.0: ; %bb
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; LIT-SRCC: ; %bb.0: ; %bb
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+bb:
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 1), i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; NOLIT-SRCC: ; %bb.0:
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; LIT-SRCC: ; %bb.0:
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX942-VGPR: ; %bb.0:
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -3219,6 +3957,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3435,6 +4231,34 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3542,6 +4366,25 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3616,6 +4459,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, ptr addrspace(1) %arg
@@ -3745,6 +4601,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -3885,6 +4757,24 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -4091,6 +4981,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
@@ -4175,6 +5086,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, ptr addrspace(1) %arg
@@ -4355,6 +5281,36 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -4667,6 +5623,74 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
@@ -4755,6 +5779,24 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -4846,6 +5888,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -5109,6 +6168,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_vecarg:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 04ee0bb..f78ea92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v12, s0
-; SDAG-NEXT: v_mov_b32_e32 v13, s1
-; SDAG-NEXT: v_mov_b32_e32 v14, s2
-; SDAG-NEXT: v_mov_b32_e32 v15, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v20, s28
-; SDAG-NEXT: v_mov_b32_e32 v21, s29
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s24
+; SDAG-NEXT: v_mov_b32_e32 v11, s25
+; SDAG-NEXT: v_mov_b32_e32 v12, s26
+; SDAG-NEXT: v_mov_b32_e32 v13, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1895,7 +1895,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -1915,16 +1915,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s29
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b32_e32 v20, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1964,7 +1962,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -1974,7 +1972,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -1983,21 +1981,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2031,7 +2025,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2041,7 +2035,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2050,21 +2044,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2096,7 +2086,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2107,7 +2097,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2116,14 +2106,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2162,7 +2148,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2173,7 +2159,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2182,14 +2168,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
@@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 91197f9..0b2818f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,26 +3515,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s0
-; SDAG-NEXT: v_mov_b32_e32 v25, s1
-; SDAG-NEXT: v_mov_b32_e32 v26, s2
-; SDAG-NEXT: v_mov_b32_e32 v27, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b32_e32 v22, s24
+; SDAG-NEXT: v_mov_b32_e32 v23, s25
+; SDAG-NEXT: v_mov_b32_e32 v24, s26
+; SDAG-NEXT: v_mov_b32_e32 v25, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
@@ -3550,7 +3550,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3993,34 +3993,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
@@ -4028,7 +4028,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v32, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
@@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
-; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b32_e32 v32, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: s_movk_i32 s2, 0x41
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4735,26 +4703,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4770,45 +4738,44 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4922,42 +4889,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5033,78 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s12
+; SDAG-NEXT: v_mov_b32_e32 v33, s13
+; SDAG-NEXT: v_mov_b32_e32 v34, s14
+; SDAG-NEXT: v_mov_b32_e32 v35, s15
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v44, s24
+; SDAG-NEXT: v_mov_b32_e32 v45, s25
+; SDAG-NEXT: v_mov_b32_e32 v46, s26
+; SDAG-NEXT: v_mov_b32_e32 v47, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5112,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a27, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a26, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a25, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a24, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a23, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a22, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a21, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a20, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a19, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a18, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a17, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a16, s8
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5180,78 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v24, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s21
+; SDAG-NEXT: v_mov_b32_e32 v26, s22
+; SDAG-NEXT: v_mov_b32_e32 v27, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v28, s24
+; SDAG-NEXT: v_mov_b32_e32 v29, s25
+; SDAG-NEXT: v_mov_b32_e32 v30, s26
+; SDAG-NEXT: v_mov_b32_e32 v31, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5259,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -6302,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 7193fee..31a48de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -1,22 +1,54 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x8xf32:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[4:5], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 6
+; GFX942-SDAG-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -24,17 +56,82 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
+; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -43,3 +140,5 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad..b25fe83 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-NEXT: v_mov_b32_e32 v12, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -624,25 +624,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NEXT: v_mov_b32_e32 v5, s21
-; GCN-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s2
+; GCN-NEXT: v_mov_b32_e32 v13, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NEXT: v_mov_b32_e32 v6, s20
+; GCN-NEXT: v_mov_b32_e32 v7, s21
+; GCN-NEXT: v_mov_b32_e32 v8, s22
+; GCN-NEXT: v_mov_b32_e32 v9, s23
; GCN-NEXT: v_accvgpr_write_b32 a0, s24
; GCN-NEXT: v_accvgpr_write_b32 a1, s25
; GCN-NEXT: v_accvgpr_write_b32 a2, s26
; GCN-NEXT: v_accvgpr_write_b32 a3, s27
-; GCN-NEXT: v_mov_b32_e32 v12, s28
+; GCN-NEXT: v_mov_b32_e32 v0, s28
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -887,24 +887,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -964,25 +964,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1429,24 +1429,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1506,25 +1506,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1598,24 +1598,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1675,25 +1675,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1767,24 +1767,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1844,25 +1844,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1936,24 +1936,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2013,25 +2013,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
ret <16 x float> %result
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index 429b3b8..6e24a6a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -1,36 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250,GL2-ONLY %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX1250-SPREFETCH,GFX1250-SPREFETCH-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-cu-prefetch < %s | FileCheck --check-prefixes=GCN,GFX1250,SAFE-CU %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX12-SPREFETCH,SPREFETCH-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250,GL2-ONLY %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX1250-SPREFETCH,GFX1250-SPREFETCH-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+safe-cu-prefetch < %s | FileCheck --check-prefixes=GCN,GFX1250,SAFE-CU %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,SPREFETCH-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefixes=GCN,SPREFETCH,GFX12-SPREFETCH,SPREFETCH-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,NOSPREFETCH %s
; Scalar data prefetch
define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
ret void
}
define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:512 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x200, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -40,14 +58,20 @@ entry:
; Check large offsets
define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_max_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:8388607 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x7fffff, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -55,6 +79,20 @@ entry:
}
define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_min_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:-8388608 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_data_sgpr_min_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -68,6 +106,13 @@ define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr
; SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
@@ -81,6 +126,18 @@ entry:
}
define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_data_sgpr_too_large_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -91,6 +148,13 @@ define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inre
; SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
@@ -105,15 +169,113 @@ entry:
; Check divergent address
-define amdgpu_ps void @prefetch_data_vgpr(ptr addrspace(1) %ptr) {
-; GCN-LABEL: prefetch_data_vgpr:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_endpgm
+define amdgpu_ps void @prefetch_data_vgpr_global(ptr addrspace(1) %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
ret void
}
+define amdgpu_ps void @prefetch_data_vgpr_flat(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_global(ptr addrspace(1) inreg %ptr, i32 %offset) {
+; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+; GFX12-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_endpgm
+; GFX11-LABEL: prefetch_data_sgpr_vgpr_offset_global:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %offset
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_flat(ptr inreg %ptr, i32 %offset) {
+; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+; GFX12-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_endpgm
+; GFX11-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr %ptr, i32 %offset
+ %gep2 = getelementptr i8, ptr %gep1, i32 128
+ tail call void @llvm.prefetch.pf(ptr %gep2, i32 0, i32 0, i32 1)
+ ret void
+}
+
; Check LDS and Scratch, we cannot prefetch it
define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) {
@@ -137,43 +299,59 @@ entry:
; Check supported address spaces
define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_flat:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
ret void
}
define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_global:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_global:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
ret void
}
define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_mov_b32 s1, 0
; SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
ret void
@@ -182,28 +360,36 @@ entry:
; I$ prefetch
define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_inst_sgpr:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_inst_sgpr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_inst_sgpr:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
ret void
}
define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_inst_sgpr_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_inst_sgpr_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x80, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -213,14 +399,18 @@ entry:
; Check large offsets
define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
-; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
-; NOSPREFETCH: ; %bb.0: ; %entry
-; NOSPREFETCH-NEXT: s_endpgm
+; GFX1250-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
;
; SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
; SPREFETCH: ; %bb.0: ; %entry
; SPREFETCH-NEXT: s_prefetch_inst s[0:1], 0x7fffff, null, 0
; SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -228,6 +418,18 @@ entry:
}
define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_inst_sgpr_min_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -241,6 +443,13 @@ define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr
; SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0xff800000
@@ -254,6 +463,16 @@ entry:
}
define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
+; GFX1250-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x800000
+; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-SDAG-NEXT: s_endpgm
+;
; NOSPREFETCH-LABEL: prefetch_inst_sgpr_too_large_offset:
; NOSPREFETCH: ; %bb.0: ; %entry
; NOSPREFETCH-NEXT: s_endpgm
@@ -264,6 +483,13 @@ define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inre
; SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
; SPREFETCH-SDAG-NEXT: s_endpgm
;
+; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
+; GFX1250-SPREFETCH-GISEL: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
+; GFX1250-SPREFETCH-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-GISEL-NEXT: s_endpgm
+;
; SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
; SPREFETCH-GISEL: ; %bb.0: ; %entry
; SPREFETCH-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800000
@@ -276,6 +502,282 @@ entry:
ret void
}
+; Check cache locality
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_dev(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat_dev:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_DEV
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 1, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_se(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat_se:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 2, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_cu(ptr %ptr) {
+; GL2-ONLY-LABEL: prefetch_data_vgpr_flat_cu:
+; GL2-ONLY: ; %bb.0: ; %entry
+; GL2-ONLY-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GL2-ONLY-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; SAFE-CU-LABEL: prefetch_data_vgpr_flat_cu:
+; SAFE-CU: ; %bb.0: ; %entry
+; SAFE-CU-NEXT: flat_prefetch_b8 v[0:1]
+; SAFE-CU-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 3, i32 1)
+ ret void
+}
+
+; flat offset
+
+define amdgpu_ps void @prefetch_data_vgpr_flat_offset(ptr %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_flat_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr %ptr, i32 128
+ tail call void @llvm.prefetch.pf(ptr %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_global_offset(ptr addrspace(1) %ptr) {
+; GFX1250-LABEL: prefetch_data_vgpr_global_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i32 128
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_global_saddr(ptr addrspace(1) inreg %ptr, i32 %voffset) {
+; GFX1250-LABEL: prefetch_data_vgpr_global_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 %voffset
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 0, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_vgpr_global_saddr_offset(ptr addrspace(1) inreg %ptr, i32 %voffset) {
+; GFX1250-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep1 = getelementptr i8, ptr addrspace(1) %ptr, i32 %voffset
+ %gep2 = getelementptr i8, ptr addrspace(1) %gep1, i32 128
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep2, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Cannot prefetch I$ with flat or global instructions.
+
+define amdgpu_ps void @prefetch_inst_vgpr_global(ptr addrspace(1) %ptr) {
+; GCN-LABEL: prefetch_inst_vgpr_global:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_inst_vgpr_flat(ptr %ptr) {
+; GCN-LABEL: prefetch_inst_vgpr_flat:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; Force vector prefetch for uniform address with rw = 1 argument.
+
+define amdgpu_ps void @prefetch_data_sgpr_flat_force_vector(ptr inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_flat_force_vector:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SPREFETCH-NEXT: flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.pf(ptr %ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global_force_vector(ptr addrspace(1) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_global_force_vector:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @prefetch_data_sgpr_global_saddr_force_vector(ptr addrspace(1) inreg %ptr) {
+; GFX1250-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; GFX1250-SPREFETCH: ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SPREFETCH-NEXT: global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS
+; GFX1250-SPREFETCH-NEXT: s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; NOSPREFETCH: ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_prefetch_data s[0:1], 0x400, null, 0
+; GFX12-SPREFETCH-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i32 1024
+ tail call void @llvm.prefetch.p1(ptr addrspace(1) %gep, i32 1, i32 0, i32 1)
+ ret void
+}
+
declare void @llvm.prefetch.pf(ptr nocapture readonly, i32, i32, i32)
declare void @llvm.prefetch.p1(ptr addrspace(1) nocapture readonly, i32, i32, i32)
declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 874dece..1e6b77e 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch -mattr=+safe-smem-prefetch < %s | FileCheck --check-prefix=GFX12-SPREFETCH %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GFX1250 %s
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GFX12-LABEL: copy_flat:
@@ -55,6 +56,33 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_2
; GFX12-SPREFETCH-NEXT: .LBB0_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_flat:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB0_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX1250-NEXT: .LBB0_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: flat_load_b128 v[2:5], v0, s[2:3] offset:-176
+; GFX1250-NEXT: flat_prefetch_b8 v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b128 v0, v[2:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX1250-NEXT: s_cbranch_scc1 .LBB0_2
+; GFX1250-NEXT: .LBB0_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -123,6 +151,33 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB1_2
; GFX12-SPREFETCH-NEXT: .LBB1_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_global:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB1_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GFX1250-NEXT: .LBB1_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: global_load_b128 v[2:5], v0, s[2:3] offset:-176
+; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX1250-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX1250-NEXT: .LBB1_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -193,6 +248,34 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB2_2
; GFX12-SPREFETCH-NEXT: .LBB2_3: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_constant:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s6, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB2_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: .LBB2_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_prefetch_b8 v0, s[2:3] offset:176 scope:SCOPE_SE
+; GFX1250-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
+; GFX1250-NEXT: s_add_co_i32 s6, s6, -1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
+; GFX1250-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
+; GFX1250-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GFX1250-NEXT: s_cbranch_scc1 .LBB2_2
+; GFX1250-NEXT: .LBB2_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -262,6 +345,29 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12-SPREFETCH-NEXT: .LBB3_2: ; %for.end
; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_local:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s2, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB3_2
+; GFX1250-NEXT: .LBB3_1: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v4, s0
+; GFX1250-NEXT: s_add_co_i32 s2, s2, -1
+; GFX1250-NEXT: s_add_co_i32 s0, s0, 16
+; GFX1250-NEXT: s_add_co_i32 s1, s1, 16
+; GFX1250-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
+; GFX1250-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-NEXT: s_wait_dscnt 0x1
+; GFX1250-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
+; GFX1250-NEXT: s_wait_dscnt 0x1
+; GFX1250-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
+; GFX1250-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX1250-NEXT: .LBB3_2: ; %for.end
+; GFX1250-NEXT: s_endpgm
entry:
%cmp6.not = icmp eq i32 %n, 0
br i1 %cmp6.not, label %for.end, label %for.body
@@ -280,3 +386,267 @@ for.body: ; preds = %entry, %for.body
for.end: ; preds = %for.body, %entry
ret void
}
+
+define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
+; GFX12-LABEL: copy_flat_divergent:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: .LBB4_2: ; %for.body
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12-NEXT: .LBB4_3: ; %for.end
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_flat_divergent:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body
+; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe
+; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SPREFETCH-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12-SPREFETCH-NEXT: .LBB4_3: ; %for.end
+; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_flat_divergent:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
+; GFX1250-NEXT: .LBB4_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176
+; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3]
+; GFX1250-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[4:7]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1]
+; GFX1250-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1250-NEXT: .LBB4_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %s.tid = getelementptr inbounds <4 x i32>, ptr %s, i32 %tid
+ %d.tid = getelementptr inbounds <4 x i32>, ptr %d, i32 %tid
+ %cmp6.not = icmp eq i32 %n, 0
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = zext i32 %i.07 to i64
+ %arrayidx = getelementptr inbounds <4 x i32>, ptr %s.tid, i64 %idxprom
+ %ld = load <4 x i32>, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d.tid, i64 %idxprom
+ store <4 x i32> %ld, ptr %arrayidx2, align 4
+ %inc = add nuw i32 %i.07, 1
+ %exitcond.not = icmp eq i32 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
+; GFX12-LABEL: copy_global_divergent:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX12-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: .LBB5_2: ; %for.body
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX12-NEXT: .LBB5_3: ; %for.end
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: copy_global_divergent:
+; GFX12-SPREFETCH: ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: s_cmp_eq_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX12-SPREFETCH-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12-SPREFETCH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
+; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: .LBB5_2: ; %for.body
+; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SPREFETCH-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe
+; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-SPREFETCH-NEXT: s_wait_loadcnt 0x0
+; GFX12-SPREFETCH-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX12-SPREFETCH-NEXT: .LBB5_3: ; %for.end
+; GFX12-SPREFETCH-NEXT: s_endpgm
+;
+; GFX1250-LABEL: copy_global_divergent:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_cmp_eq_u32 s0, 0
+; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3
+; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader
+; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3]
+; GFX1250-NEXT: .LBB5_2: ; %for.body
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: global_load_b128 v[4:7], v[2:3], off offset:-176
+; GFX1250-NEXT: global_prefetch_b8 v[2:3], off scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3]
+; GFX1250-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 16, v[0:1]
+; GFX1250-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX1250-NEXT: .LBB5_3: ; %for.end
+; GFX1250-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %s.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i32 %tid
+ %d.tid = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i32 %tid
+ %cmp6.not = icmp eq i32 %n, 0
+ br i1 %cmp6.not, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = zext i32 %i.07 to i64
+ %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s.tid, i64 %idxprom
+ %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d.tid, i64 %idxprom
+ store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
+ %inc = add nuw i32 %i.07, 1
+ %exitcond.not = icmp eq i32 %inc, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
new file mode 100644
index 0000000..11cda2d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
@@ -0,0 +1,634 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.hi = lshr i32 %src0, 16
+ %src1.hi = lshr i32 %src1, 16
+ %src2.hi = lshr i32 %src2, 16
+ %src0.i16 = trunc i32 %src0.hi to i16
+ %src1.i16 = trunc i32 %src1.hi to i16
+ %src2.i16 = trunc i32 %src2.hi to i16
+ %src0.fp16 = bitcast i16 %src0.i16 to bfloat
+ %src1.fp16 = bitcast i16 %src1.i16 to bfloat
+ %src2.fp16 = bitcast i16 %src2.i16 to bfloat
+ %src0.ext = fpext bfloat %src0.fp16 to float
+ %src1.ext = fpext bfloat %src1.fp16 to float
+ %src2.ext = fpext bfloat %src2.fp16 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.hi = extractelement <2 x bfloat> %src0, i32 1
+ %src1.hi = extractelement <2 x bfloat> %src1, i32 1
+ %src2.hi = extractelement <2 x bfloat> %src2, i32 1
+ %src0.ext = fpext bfloat %src0.hi to float
+ %src1.ext = fpext bfloat %src1.hi to float
+ %src2.ext = fpext bfloat %src2.hi to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define <2 x float> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_shuffle:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v5, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.shuf = shufflevector <2 x bfloat> %src0, <2 x bfloat> undef, <2 x i32> <i32 1, i32 0>
+ %src1.shuf = shufflevector <2 x bfloat> %src1, <2 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+ %src2.shuf = shufflevector <2 x bfloat> %src2, <2 x bfloat> undef, <2 x i32> <i32 1, i32 1>
+ %src0.ext = fpext <2 x bfloat> %src0.shuf to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1.shuf to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2.shuf to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ ret <2 x float> %result
+}
+
+define float @v_mad_mix_f32_negbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %src0.ext.neg = fneg float %src0.ext
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_absbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_absbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+ %src0.ext.neg.abs = fneg float %src0.ext.abs
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg.abs, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_negf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negf32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.neg = fneg float %src2
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_absf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_absf32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.abs = call float @llvm.fabs.f32(float %src2)
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.abs)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_negabsf32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negabsf32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.abs = call float @llvm.fabs.f32(float %src2)
+ %src2.neg.abs = fneg float %src2.abs
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg.abs)
+ ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32imm1(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imm1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 1.0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 0.15915494
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000)
+ ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 0x3e230000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2 = fpext bfloat 0xR3e23 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63(bfloat %src0, bfloat %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, 0x367c0000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2 = fpext bfloat 0xR367c to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_f32imm1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 1.0, float 1.0>)
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_cvtbf16imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_cvtbf16imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT: s_mov_b32 s0, 0x3e230000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], s[0:1] op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2 = fpext <2 x bfloat> <bfloat 0xR3e23, bfloat 0xR3e23> to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2)
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 0.15915494 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2 = fpext <2 x bfloat> <bfloat 0xR3e23, bfloat 0xR3e23> to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> <float 0x3FC45F3060000000, float 0x3FC45F3060000000>)
+ ret <2 x float> %result
+}
+
+define float @v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.hi = extractelement <2 x bfloat> %src0, i32 1
+ %src1.hi = extractelement <2 x bfloat> %src1, i32 1
+ %src2.hi = extractelement <2 x bfloat> %src2, i32 1
+ %src0.ext = fpext bfloat %src0.hi to float
+ %src1.ext = fpext bfloat %src1.hi to float
+ %src2.ext = fpext bfloat %src2.hi to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ ret float %clamp
+}
+
+define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: no_mix_simple:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ ret float %result
+}
+
+define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: no_mix_simple_fabs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_f32 v0, |v0|, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.fabs = call float @llvm.fabs.f32(float %src0)
+ %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2)
+ ret float %result
+}
+
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, float %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_mul_f32 v0, v0, v1
+; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %mul = fmul float %src0.ext, %src1.ext
+ %result = fadd float %mul, %src2.ext
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #1 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %mul = fmul float %src0.ext, %src1.ext
+ %result = fadd float %mul, %src2
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %mul = fmul contract float %src0.ext, %src1.ext
+ %result = fadd contract float %mul, %src2.ext
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %mul = fmul contract float %src0.ext, %src1.ext
+ %result = fadd contract float %mul, %src2
+ ret float %result
+}
+
+define float @v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0
+ %src0.neg = fneg bfloat %src0
+ %src0.ext = fpext bfloat %src0.neg to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+
+define float @v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1
+ %src0.neg = fneg bfloat %src0
+ %src0.ext = fpext bfloat %src0.neg to float
+ %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext)
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1
+ %src0.abs = call bfloat @llvm.fabs.bf16(bfloat %src0)
+ %src0.ext = fpext bfloat %src0.abs to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %fneg = fneg <2 x bfloat> %src0.arg.bc
+ %src0 = extractelement <2 x bfloat> %fneg, i32 1
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc)
+ %src0 = extractelement <2 x bfloat> %fabs, i32 1
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat>
+ %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc)
+ %fneg.fabs = fneg <2 x bfloat> %fabs
+ %src0 = extractelement <2 x bfloat> %fneg.fabs, i32 1
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0, half %src1, half %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_fmac_f32_e32 v0, v3, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.bf16 = bitcast half %src0 to bfloat
+ %src1.bf16 = bitcast half %src1 to bfloat
+ %src2.bf16 = bitcast half %src2 to bfloat
+ %src0.ext = fpext bfloat %src0.bf16 to float
+ %src1.ext = fpext bfloat %src1.bf16 to float
+ %src2.ext = fpext bfloat %src2.bf16 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.bf16 = bitcast half %src0 to bfloat
+ %src0.ext = fpext bfloat %src0.bf16 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ ret float %result
+}
+
+define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_fma_mix_f32_bf16_src2_bf16lo:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1]
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
+entry:
+ %v0 = shl i32 %y, 16
+ %v1 = bitcast i32 %v0 to float
+ %mul7 = fmul contract float %x, 0.000000e+00
+ %add2 = fadd contract float %mul7, %v1
+ %v2 = fcmp uno float %add2, 0.000000e+00
+ %v3 = select i1 %v2, i64 1, i64 0
+ store i64 %v3, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+declare bfloat @llvm.fabs.bf16(bfloat) #2
+declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #2
+declare float @llvm.fabs.f32(float) #2
+declare float @llvm.minnum.f32(float, float) #2
+declare float @llvm.maxnum.f32(float, float) #2
+declare float @llvm.fmuladd.f32(float, float, float) #2
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
new file mode 100644
index 0000000..393581f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0x3f80
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_mov_b32_e32 v0, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %vec.result = insertelement <2 x bfloat> <bfloat 1.0, bfloat undef>, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo(bfloat %src0, bfloat %src1, bfloat %src2, bfloat %lo) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %vec = insertelement <2 x bfloat> undef, bfloat %lo, i32 0
+ %vec.result = insertelement <2 x bfloat> %vec, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %bc = bitcast bfloat %cvt.result to i16
+ %ext = zext i16 %bc to i32
+ %shr = shl i32 %ext, 16
+ ret i32 %shr
+}
+
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %bc = bitcast bfloat %cvt.result to i16
+ %ext = sext i16 %bc to i32
+ %shr = shl i32 %ext, 16
+ ret i32 %shr
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %cvt.result = fptrunc float %clamp to bfloat
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ store volatile bfloat %cvt.result, ptr addrspace(1) undef
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+ ret <2 x bfloat> %vec.result
+}
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
new file mode 100644
index 0000000..1b2eb83
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -0,0 +1,512 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+
+define bfloat @mixlo_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: mixlo_simple:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
+; GFX1250-LABEL: mixlo_simpl_no_flush:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush(bfloat %src0, bfloat %src1, bfloat %src2) {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %src2.ext = fpext bfloat %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ ret bfloat %cvt.result
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt(bfloat %src0, bfloat %src1, float %src2) #0 {
+; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext bfloat %src0 to float
+ %src1.ext = fpext bfloat %src1 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
+ %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ %cvt.result = fptrunc float %clamp to bfloat
+ ret bfloat %cvt.result
+}
+
+
+define <2 x bfloat> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ ret <2 x bfloat> %cvt.result
+}
+
+define <3 x bfloat> @v_mad_mix_v3f32(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_mov_b32_e32 v0, v6
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+ %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+ %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+ %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+ %cvt.result = fptrunc <3 x float> %result to <3 x bfloat>
+ ret <3 x bfloat> %cvt.result
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
+; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v4 :: v_dual_lshlrev_b32 v10, 16, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[12:13]
+; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[6:7], v[8:9], v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+ %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+ %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+ %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+ %cvt.result = fptrunc <4 x float> %result to <4 x bfloat>
+ ret <4 x bfloat> %cvt.result
+}
+
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %cvt.result, <2 x bfloat> zeroinitializer)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ ret <2 x bfloat> %clamp
+}
+
+
+define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_mov_b32_e32 v0, v6
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+ %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+ %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+ %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+ %cvt.result = fptrunc <3 x float> %result to <3 x bfloat>
+ %max = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %cvt.result, <3 x bfloat> zeroinitializer)
+ %clamp = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %max, <3 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+ ret <3 x bfloat> %clamp
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3
+; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
+; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+ %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+ %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+ %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+ %cvt.result = fptrunc <4 x float> %result to <4 x bfloat>
+ %max = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %cvt.result, <4 x bfloat> zeroinitializer)
+ %clamp = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %max, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+ ret <4 x bfloat> %clamp
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp
+; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ %cvt.lo = extractelement <2 x bfloat> %cvt.result, i32 0
+ %max.lo = call bfloat @llvm.maxnum.bf16(bfloat %cvt.lo, bfloat 0.0)
+ %clamp.lo = call bfloat @llvm.minnum.bf16(bfloat %max.lo, bfloat 1.0)
+ %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.lo, i32 0
+ ret <2 x bfloat> %insert
+}
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %cvt.result = fptrunc <2 x float> %result to <2 x bfloat>
+ %cvt.hi = extractelement <2 x bfloat> %cvt.result, i32 1
+ %max.hi = call bfloat @llvm.maxnum.bf16(bfloat %cvt.hi, bfloat 0.0)
+ %clamp.hi = call bfloat @llvm.minnum.bf16(bfloat %max.hi, bfloat 1.0)
+ %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.hi, i32 1
+ ret <2 x bfloat> %insert
+}
+
+
+define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
+; GFX1250-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
+ %src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
+ %src2.ext = fpext <2 x bfloat> %src2 to <2 x float>
+ %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
+ %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
+ %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+ %cvt.result = fptrunc <2 x float> %clamp to <2 x bfloat>
+ ret <2 x bfloat> %cvt.result
+}
+
+
+define <3 x bfloat> @v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
+ %src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
+ %src2.ext = fpext <3 x bfloat> %src2 to <3 x float>
+ %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
+ %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
+ %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
+ %cvt.result = fptrunc <3 x float> %clamp to <3 x bfloat>
+ ret <3 x bfloat> %cvt.result
+}
+
+define <4 x bfloat> @v_mad_mix_v4f32_clamp_precvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 {
+; GFX1250-LABEL: v_mad_mix_v4f32_clamp_precvt:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1
+; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3
+; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v5 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_fma_f32 v[4:5], v[8:9], v[10:11], v[12:13]
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_num_f32_e64 v2, v5, v5 clamp
+; GFX1250-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX1250-NEXT: v_max_num_f32_e64 v3, v4, v4 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v3, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
+ %src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
+ %src2.ext = fpext <4 x bfloat> %src2 to <4 x float>
+ %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
+ %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
+ %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ %cvt.result = fptrunc <4 x float> %clamp to <4 x bfloat>
+ ret <4 x bfloat> %cvt.result
+}
+
+define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
+; GFX1250-LABEL: mixlo_zext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to bfloat
+ %cvt.result.i16 = bitcast bfloat %cvt.result to i16
+ %cvt.result.i32 = zext i16 %cvt.result.i16 to i32
+ ret i32 %cvt.result.i32
+}
+
+define bfloat @mixlo_fptrunc(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %mul = fmul float %a, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_no_flush(float %a, float %b) {
+; GFX1250-LABEL: mixlo_fptrunc_no_flush:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %mul = fmul float %a, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc_abs_src_mod:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, |v0|, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %mul = fmul float %a.fabs, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+define bfloat @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 {
+; GFX1250-LABEL: mixlo_fptrunc_neg_src_mod:
+; GFX1250: ; %bb.0: ; %.entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, -v0, v1, 0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+.entry:
+ %a.fneg = fneg float %a
+ %mul = fmul float %a.fneg, %b
+ %trunc = fptrunc float %mul to bfloat
+ ret bfloat %trunc
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1
+
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1
+
+declare float @llvm.minnum.f32(float, float) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.maxnum.f32(float, float) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
+
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 9cc42ac..be02045 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: s_movk_i32 s4, 0xf800
; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
+; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
-; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
+; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 1379eb6..80445f7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -146,7 +146,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -218,7 +218,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -290,7 +290,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -360,7 +360,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -427,7 +427,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -499,7 +499,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -571,7 +571,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -663,7 +663,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -745,7 +745,7 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -843,7 +843,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -941,7 +941,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1033,7 +1033,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1115,7 +1115,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1213,7 +1213,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1311,7 +1311,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1405,7 +1405,7 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1491,7 +1491,7 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence release, !mmra !{!"amdgpu-as", !"global"}
+ fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1595,7 +1595,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1699,7 +1699,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1793,7 +1793,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acquire, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1879,7 +1879,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") release, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -1983,7 +1983,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
@@ -2087,6 +2087,6 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-as", !"global"}
+ fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
index 971015b..7a419a5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
@@ -77,7 +77,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -143,7 +143,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -209,7 +209,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -275,7 +275,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -332,7 +332,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -389,7 +389,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -446,7 +446,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -503,7 +503,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -571,7 +571,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -637,7 +637,7 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -703,7 +703,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -769,7 +769,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -826,7 +826,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -883,7 +883,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -940,7 +940,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -997,7 +997,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1065,7 +1065,7 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1131,7 +1131,7 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence release, !mmra !{!"amdgpu-as", !"local"}
+ fence release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1197,7 +1197,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1263,7 +1263,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1320,7 +1320,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acquire, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1377,7 +1377,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") release, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1434,7 +1434,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
@@ -1491,6 +1491,6 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
entry:
- fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-as", !"local"}
+ fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index bc25084..5e5e3bf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 6763957..f7aaa3e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -1,15 +1,148 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vgpr:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -17,9 +150,142 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -27,9 +293,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={a0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -38,9 +400,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use a[100:131]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison)
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -49,10 +507,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={v0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -61,9 +614,127 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s38, -1
+; GFX908-NEXT: s_mov_b32 s39, 0xe00000
+; GFX908-NEXT: s_add_u32 s36, s36, s11
+; GFX908-NEXT: s_addc_u32 s37, s37, 0
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_add_u32 s8, s4, 44
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: v_mov_b32_e32 v40, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112
+; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64
+; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80
+; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32
+; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48
+; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35]
+; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
call void @foo()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -75,10 +746,173 @@ bb:
; We could avoid scan to find calls since we see these during lowering before selection.
; However, in SDag lowering and selection is done block by block, so it would only work
; in Global ISel.
-
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX908: ; %bb.0: ; %bb1
+; GFX908-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s54, -1
+; GFX908-NEXT: s_mov_b32 s55, 0xe00000
+; GFX908-NEXT: s_add_u32 s52, s52, s11
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v6, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v7, 0
+; GFX908-NEXT: s_addc_u32 s53, s53, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
+; GFX908-NEXT: s_bitcmp0_b32 s8, 0
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, s36
+; GFX908-NEXT: v_mov_b32_e32 v4, s37
+; GFX908-NEXT: v_mov_b32_e32 v5, s40
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX908-NEXT: v_mov_b32_e32 v3, s38
+; GFX908-NEXT: v_mov_b32_e32 v4, s39
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v4
+; GFX908-NEXT: v_mov_b32_e32 v3, s41
+; GFX908-NEXT: v_mov_b32_e32 v4, s42
+; GFX908-NEXT: v_mov_b32_e32 v5, s43
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s44
+; GFX908-NEXT: v_mov_b32_e32 v4, s45
+; GFX908-NEXT: v_mov_b32_e32 v5, s46
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s47
+; GFX908-NEXT: v_mov_b32_e32 v4, s48
+; GFX908-NEXT: v_mov_b32_e32 v5, s49
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s50
+; GFX908-NEXT: v_mov_b32_e32 v4, s51
+; GFX908-NEXT: v_mov_b32_e32 v5, s16
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s17
+; GFX908-NEXT: v_mov_b32_e32 v4, s18
+; GFX908-NEXT: v_mov_b32_e32 v5, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
+; GFX908-NEXT: v_mov_b32_e32 v4, s21
+; GFX908-NEXT: v_mov_b32_e32 v5, s22
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s23
+; GFX908-NEXT: v_mov_b32_e32 v4, s24
+; GFX908-NEXT: v_mov_b32_e32 v5, s25
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s26
+; GFX908-NEXT: v_mov_b32_e32 v4, s27
+; GFX908-NEXT: v_mov_b32_e32 v5, s28
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s29
+; GFX908-NEXT: v_mov_b32_e32 v4, s30
+; GFX908-NEXT: v_mov_b32_e32 v5, s31
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16
+; GFX908-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX908-NEXT: ; %bb.1: ; %bb2
+; GFX908-NEXT: s_add_u32 s8, s4, 48
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: .LBB6_2: ; %bb3
+; GFX908-NEXT: s_endpgm
bb1:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -94,10 +928,101 @@ bb3:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -105,9 +1030,101 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -121,3 +1138,6 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX90A: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 9c38d7f..5b0d2d2 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -3565,13 +3565,13 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index a5c8f04..f54a383 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s
; Partial reg copy and spill missed during regalloc handled later at frame lowering.
@@ -12,17 +12,21 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %27
+ ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
- ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
- ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; REGALLOC-GFX908-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
;
; PEI-GFX908-LABEL: name: partial_copy
@@ -57,40 +61,35 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %7
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %24
+ ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %24
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %22
+ ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %22
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
- ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; REGALLOC-GFX90A-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
- ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: S_ENDPGM 0
call void asm sideeffect "; use $0", "a" (i32 poison)
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index d48bfe0..68ef30a9 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -53,31 +53,31 @@ define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: v_mov_b32_e32 v2, s20
+; GFX942-NEXT: v_mov_b32_e32 v3, s21
+; GFX942-NEXT: v_mov_b32_e32 v4, s22
+; GFX942-NEXT: v_mov_b32_e32 v5, s23
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, s17
-; GFX942-NEXT: v_mov_b32_e32 v2, s18
-; GFX942-NEXT: v_mov_b32_e32 v3, s19
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: v_mov_b32_e32 v2, s16
+; GFX942-NEXT: v_mov_b32_e32 v3, s17
+; GFX942-NEXT: v_mov_b32_e32 v4, s18
+; GFX942-NEXT: v_mov_b32_e32 v5, s19
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s12
-; GFX942-NEXT: v_mov_b32_e32 v1, s13
-; GFX942-NEXT: v_mov_b32_e32 v2, s14
-; GFX942-NEXT: v_mov_b32_e32 v3, s15
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-NEXT: v_mov_b32_e32 v4, s14
+; GFX942-NEXT: v_mov_b32_e32 v5, s15
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: v_mov_b32_e32 v3, s9
+; GFX942-NEXT: v_mov_b32_e32 v4, s10
+; GFX942-NEXT: v_mov_b32_e32 v5, s11
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll
new file mode 100644
index 0000000..2324f3f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll
@@ -0,0 +1,29 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.read_register with illegal type
+define amdgpu_kernel void @test_read_register_i9(ptr addrspace(1) %out) nounwind {
+ %reg = call i9 @llvm.read_register.i9(metadata !0)
+ store i9 %reg, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.write_register with illegal type
+define amdgpu_kernel void @test_write_register_i9(ptr addrspace(1) %out) nounwind {
+ call void @llvm.write_register.i9(metadata !0, i9 42)
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.read_register with illegal type
+define amdgpu_kernel void @test_read_register_i128(ptr addrspace(1) %out) nounwind {
+ %reg = call i128 @llvm.read_register.i128(metadata !0)
+ store i128 %reg, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.write_register with illegal type
+define amdgpu_kernel void @test_write_register_i128(ptr addrspace(1) %out) nounwind {
+ call void @llvm.write_register.i128(metadata !0, i128 42)
+ ret void
+}
+
+!0 = !{!"m0"}
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index f2fd3a8..c035e9f 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -9,9 +9,9 @@
%asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> }
; CHECK-LABEL: {{^}}illegal_eviction_assert:
-; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15]
+; CHECK: ; def v[13:28] v[0:7] v[8:12] v[0:3] a[0:15]
; CHECK: ; clobber
-; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16]
+; CHECK: ; use v[13:28] v[0:7] v[8:12] v[0:3] a[1:16]
define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 {
;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"()
%asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"()
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index e8e122e..bbb9df9 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -59,19 +59,19 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-LABEL: scalar_to_vector_v8i16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.i32 = extractelement <2 x i32> %in, i64 0
@@ -146,19 +146,19 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-LABEL: scalar_to_vector_v8f16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.float = extractelement <2 x float> %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 9361187..4d864ad 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -25,27 +25,27 @@ define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -214,27 +214,27 @@ define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -265,31 +265,31 @@ define void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -327,31 +327,31 @@ define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v11, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v6
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v11, v6
+; GFX940-NEXT: v_mov_b32_e32 v10, v7
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -746,16 +746,15 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -763,17 +762,16 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v6
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 6ffc8ca..fa482d9 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -58,7 +58,8 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
; OW-NEXT: entry:
; OW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; OW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; OW-NEXT: call void [[FP]]()
+; OW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+; OW-NEXT: call void [[LOAD]]()
; OW-NEXT: ret void
;
; CW-LABEL: define {{[^@]+}}@foo
@@ -66,7 +67,8 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
; CW-NEXT: entry:
; CW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; CW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; CW-NEXT: [[TMP0:%.*]] = icmp eq ptr [[FP]], @bar1
+; CW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+; CW-NEXT: [[TMP0:%.*]] = icmp eq ptr [[LOAD]], @bar1
; CW-NEXT: br i1 [[TMP0]], label [[TMP1:%.*]], label [[TMP2:%.*]]
; CW: 1:
; CW-NEXT: call void @bar1()
@@ -86,7 +88,8 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
; NO-NEXT: entry:
; NO-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
; NO-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
-; NO-NEXT: call void [[FP]](), !callees [[META0:![0-9]+]]
+; NO-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
+; NO-NEXT: call void [[LOAD]](), !callees [[META0:![0-9]+]]
; NO-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 182ea3ec..65de7f8 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -21,7 +21,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8, !noalias.addrspace [[META0:![0-9]+]]
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8, !noalias.addrspace [[META0]]
-; ATTRIBUTOR_GCN-NEXT: call void @indirect()
+; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
; ATTRIBUTOR_GCN-NEXT: ret void
;
; GFX9-LABEL: test_simple_indirect_call:
@@ -59,7 +59,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 5484f77..eb0d546 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,15 +1,107 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s
-; GCN-LABEL: {{^}}max_12regs_13a_used:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
-; GCN-NOT: buffer_store_dword
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
-; GCN: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
+; GFX908-LABEL: max_12regs_13a_used:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s8
+; GFX908-NEXT: v_mov_b32_e32 v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v2, s10
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s11
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3]
+; GFX908-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX908-NEXT: ; %bb.1: ; %st
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+; GFX908-NEXT: .LBB0_2: ; %use
+; GFX908-NEXT: s_nop 2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 5
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 2
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 3
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_12regs_13a_used:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3]
+; GFX90A-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX90A-NEXT: ; %bb.1: ; %st
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: .LBB0_2: ; %use
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
@@ -28,16 +120,64 @@ st:
call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
ret void
}
-
-; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
-; GCN-NOT: buffer_store_dword
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
+
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
+; GFX908-LABEL: max_10_vgprs_used_9a:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_10_vgprs_used_9a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
%a1 = call <4 x i32> asm sideeffect "", "=a"()
%a2 = call <4 x i32> asm sideeffect "", "=a"()
%a3 = call i32 asm sideeffect "", "=a"()
@@ -46,17 +186,168 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
call void asm sideeffect "", "a"(<2 x i32> %a4)
ret void
}
-
-; GCN-LABEL: {{^}}max_32regs_mfma32:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN-NOT: buffer_store_dword
-; GCN: v_accvgpr_read_b32
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32
; GCN: ScratchSize: 0
+
define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
+; GFX908-LABEL: max_32regs_mfma32:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x40e00000
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41100000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41200000
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41300000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41400000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41500000
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41700000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41800000
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41880000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41900000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41980000
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a00000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41a80000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41b00000
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41b80000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41c80000
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d00000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41d80000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41e00000
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41e80000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41f80000
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 2.0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 4.0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 2.0
+; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
+; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_32regs_mfma32:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40e00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41100000
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41300000
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41400000
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41500000
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41700000
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41800000
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41880000
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41900000
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41980000
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d80000
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f00000
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
+; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, a0, s[2:3]
+; GFX90A-NEXT: s_endpgm
bb:
%v = call i32 asm sideeffect "", "=a"()
br label %use
@@ -68,42 +359,110 @@ use:
store float %elt1, ptr addrspace(1) %arg
ret void
}
+; GCN: ScratchSize: 0
; Should spill agprs to memory for both gfx908 and gfx90a.
-; GCN-LABEL: {{^}}max_6regs_used_8a:
-; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-
-; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-
-; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-
-; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-
-; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
-; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
-
-; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off
-
-; GCN: ScratchSize: 20
define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
+; GFX908-LABEL: max_6regs_used_8a:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s6, -1
+; GFX908-NEXT: s_mov_b32 s7, 0xe00000
+; GFX908-NEXT: s_add_u32 s4, s4, s3
+; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v1
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a[0:3]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_addc_u32 s5, s5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill
+; GFX908-NEXT: s_waitcnt vmcnt(4)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX908-NEXT: s_nop 3
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX908-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX908-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX908-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use v0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_6regs_used_8a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT: s_mov_b32 s6, -1
+; GFX90A-NEXT: s_mov_b32 s7, 0xe00000
+; GFX90A-NEXT: s_add_u32 s4, s4, s3
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_addc_u32 s5, s5, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_store_dword a0, off, s[4:7], 0 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword a1, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a2, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a3, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v0 = call float asm sideeffect "; def $0", "=v"()
%a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
@@ -115,6 +474,7 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
call void asm sideeffect "; use $0", "v"(float %v0);
ret void
}
+; GCN: ScratchSize: 20
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -125,3 +485,5 @@ attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgp
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index 053038d..382d892 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,14 +1,116 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-; FUNC-LABEL: {{^}}ssubo_i64_zext:
define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
+; SI-LABEL: ssubo_i64_zext:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_sub_u32 s10, s2, s8
+; SI-NEXT: s_subb_u32 s11, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v1, s11
+; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: ssubo_i64_zext:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_sub_u32 s6, s2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_subb_u32 s7, s3, s5
+; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: ssubo_i64_zext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: ssubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: s_xor_b32 s2, s6, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ssubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s2, s4
+; GFX11-NEXT: s_subb_u32 s7, s3, s5
+; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
+; GFX11-NEXT: s_xor_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -18,8 +120,102 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i32:
define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind {
+; SI-LABEL: s_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_sub_i32 s12, s8, s9
+; SI-NEXT: s_cmp_gt_i32 s9, 0
+; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT: s_cmp_lt_i32 s12, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9]
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_i32 s6, s4, s5
+; VI-NEXT: s_cmp_gt_i32 s5, 0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s6, s4
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s4, s6, s7
+; GFX9-NEXT: v_sub_i32 v1, s6, v1 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX10-NEXT: s_sub_i32 s4, s6, s7
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
+; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX11-NEXT: s_sub_i32 s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %ssub, 0
%carry = extractvalue { i32, i1 } %ssub, 1
@@ -28,8 +224,112 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i32:
define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: flat_store_dword v[0:1], v6
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX9-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_i32 v3, v1, v2 clamp
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
+; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %aptr, align 4
%b = load i32, ptr addrspace(1) %bptr, align 4
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -40,10 +340,109 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i64:
-; GCN: s_sub_u32
-; GCN: s_subb_u32
define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind {
+; SI-LABEL: s_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_u32 s12, s4, s6
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s13, s5, s7
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v1, s13
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s8, s4, s6
+; GFX11-NEXT: s_subb_u32 s9, s5, s7
+; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -52,16 +451,121 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i64:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[6:7], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
+; GFX11-NEXT: global_store_b8 v6, v0, s[6:7]
+; GFX11-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %aptr, align 4
%b = load i64, ptr addrspace(1) %bptr, align 4
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -72,14 +576,134 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_v2i32:
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3
+; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp
+; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2
+; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX11-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX11-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
new file mode 100644
index 0000000..42436a1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
+
+
+%pair = type { i32, i32 }
+
+define void @test_extractvalue_then_else(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_then_else:
+; GFX900: ; %bb.0: ; %if
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB0_2
+; GFX900-NEXT: ; %bb.1: ; %else
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT: .LBB0_2: ; %Flow
+; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+if:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define void @test_extractvalue_else_then(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_else_then:
+; GFX900: ; %bb.0: ; %if
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB1_2
+; GFX900-NEXT: ; %bb.1: ; %else
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT: .LBB1_2: ; %merge
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+if:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %else, label %then
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 {
+; GFX900-LABEL: test_loop_with_if:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: s_movk_i32 s10, 0xfe
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_bitcmp1_b32 s2, 0
+; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
+; GFX900-NEXT: v_mov_b32_e32 v2, s1
+; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3
+; GFX900-NEXT: s_branch .LBB2_2
+; GFX900-NEXT: .LBB2_1: ; %latch
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v5, 20, v3
+; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s10, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: flat_store_dword v[1:2], v3
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB2_8
+; GFX900-NEXT: .LBB2_2: ; %loop
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: flat_load_dwordx2 v[3:4], v[1:2]
+; GFX900-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX900-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX900-NEXT: s_mov_b64 s[6:7], 0
+; GFX900-NEXT: s_cbranch_vccnz .LBB2_4
+; GFX900-NEXT: ; %bb.3: ; %if
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v5
+; GFX900-NEXT: s_andn2_b64 s[8:9], s[2:3], exec
+; GFX900-NEXT: s_and_b64 s[12:13], vcc, exec
+; GFX900-NEXT: s_mov_b64 s[6:7], -1
+; GFX900-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX900-NEXT: .LBB2_4: ; %Flow
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[8:9]
+; GFX900-NEXT: s_xor_b64 s[8:9], exec, s[12:13]
+; GFX900-NEXT: s_cbranch_execz .LBB2_6
+; GFX900-NEXT: ; %bb.5: ; %else
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX900-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GFX900-NEXT: .LBB2_6: ; %Flow1
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX900-NEXT: s_cbranch_execz .LBB2_1
+; GFX900-NEXT: ; %bb.7: ; %then
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: flat_store_dword v[1:2], v0
+; GFX900-NEXT: s_branch .LBB2_1
+; GFX900-NEXT: .LBB2_8: ; %end
+; GFX900-NEXT: s_endpgm
+entry:
+ %a = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %loop
+
+loop:
+ %entry_phi = phi i32 [ 0, %entry ], [ %a15, %latch ]
+ %load = load %pair, ptr %ptr
+ br i1 %cond, label %if, label %else
+
+if:
+ %cmp = icmp sgt i32 %entry_phi, 10
+ br i1 %cmp, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load, 0
+ store i32 %a, ptr %ptr, align 4
+ br label %latch
+
+else:
+ %a2 = extractvalue %pair %load, 1
+ %y = extractvalue %pair %load, 0
+ %a_else = add i32 %y, %a2
+ br label %latch
+
+latch:
+ %a_test = phi i32 [ %a_then, %then ], [ %a_else, %else ]
+ store i32 %a_test, ptr %ptr
+ %a15 = add nsw i32 %a_test, 20
+ %a16 = icmp slt i32 %a15, 255
+ br i1 %a16, label %loop, label %end
+
+end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d23e314..f6c357d 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s82, s16
@@ -84,7 +84,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15
@@ -120,7 +120,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
@@ -128,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -186,10 +186,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -228,8 +228,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i
@@ -265,7 +265,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s83
; GLOBALNESS1-NEXT: s_mov_b32 s14, s82
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
@@ -277,13 +277,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8
-; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
+; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_mov_b32 s55, s7
-; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -291,8 +291,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i
@@ -384,12 +384,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s70, s16
@@ -398,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -407,24 +407,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15
@@ -434,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
@@ -442,7 +442,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -500,10 +500,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS0-NEXT: s_mov_b32 s83, s55
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
@@ -513,7 +513,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -543,8 +543,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i
@@ -580,7 +580,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s71
; GLOBALNESS0-NEXT: s_mov_b32 s14, s70
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
@@ -591,12 +591,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8
-; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9
-; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -604,8 +604,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index d230ff5..e1574dc 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
@@ -12,14 +14,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_add_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_addc_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -61,6 +63,40 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s4, s2, s6
+; GFX10-NEXT: s_addc_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s4, s2, s4
+; GFX11-NEXT: s_addc_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -76,21 +112,21 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_uaddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i32:
@@ -121,6 +157,34 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -137,17 +201,15 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +255,38 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +309,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +375,45 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +437,21 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s6, s4, s6
-; SI-NEXT: s_addc_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_addc_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i64:
@@ -359,6 +490,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s0, s12, s14
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s6, s4, s6
+; GFX11-NEXT: s_addc_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -375,17 +537,15 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +553,8 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +597,42 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +655,15 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +671,8 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +716,42 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +774,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +834,42 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_add_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +887,27 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_clamp_bit:
@@ -687,6 +952,45 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
@@ -711,19 +1015,19 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -786,6 +1090,50 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_add_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -813,23 +1161,23 @@ exit:
define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
; SI-LABEL: sv_uaddo_i128:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v6, s1
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_mov_b32_e32 v8, s3
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
-; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
-; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_and_b32_e32 v2, 1, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
@@ -871,6 +1219,41 @@ define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sv_uaddo_i128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sv_uaddo_i128:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX11-NEXT: v_mov_b16_e32 v2.l, v6.l
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
%uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
%carry = extractvalue { i128, i1 } %uadd, 1
%carry.ext = zext i1 %carry to i32
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d0d1ba8..b3166fa 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -8,9 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: flat_load_dword v42, v[0:1]
+; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; CHECK-NEXT: flat_load_dword v42, v[44:45]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
@@ -19,48 +18,44 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v46, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, s7
+; CHECK-NEXT: v_accvgpr_write_b32 a32, s6
+; CHECK-NEXT: v_accvgpr_write_b32 a33, s7
; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base
; CHECK-NEXT: s_cmp_lg_u32 s64, -1
; CHECK-NEXT: s_cselect_b32 s7, s7, 0
; CHECK-NEXT: s_cselect_b32 s8, s64, 0
; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: v_mov_b32_e32 v57, s7
+; CHECK-NEXT: v_mov_b32_e32 v47, s7
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
-; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s8
+; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s14, s16
-; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59]
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
-; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v62, 0
+; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -69,20 +64,20 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc
+; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s64
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47]
-; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
+; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 7d7f1b4..0289dab 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64_zext:
@@ -13,14 +14,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_sub_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_subb_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -62,6 +63,40 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s4, s2, s4
+; GFX11-NEXT: s_subb_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -76,21 +111,21 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i32:
@@ -121,6 +156,34 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -137,17 +200,15 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +254,38 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +308,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +374,45 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +436,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s6, s4, s6
-; SI-NEXT: s_subb_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64:
@@ -359,6 +489,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s4, s6
+; GFX11-NEXT: s_subb_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -375,17 +536,15 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +552,8 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +596,42 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +654,15 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +670,8 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +715,42 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +773,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +833,42 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_sub_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +886,27 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_clamp_bit:
@@ -687,6 +951,45 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
@@ -712,19 +1015,19 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -787,6 +1090,50 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_sub_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_sub_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 2f25a93..fe7def8a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1961,16 +1961,15 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX942-LABEL: shuffle_v6f16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6f16_452367:
@@ -5151,16 +5150,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX942-LABEL: shuffle_v6bf16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6bf16_452367:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989..d8264b5a 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -58,19 +58,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX942-NEXT: global_load_dword v2, v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX942-NEXT: global_load_dword v2, v1, s[2:3]
; GFX942-NEXT: .LBB1_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX942-NEXT: global_store_dword v0, v2, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -136,19 +136,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
; GFX942-NEXT: .LBB3_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -173,19 +173,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB4_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,23 +210,23 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -250,72 +250,72 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2
+; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3]
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[48:51], s[6:7] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128
+; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240
+; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -391,17 +391,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_chain:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
@@ -410,14 +410,14 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB8_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB8_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -447,38 +447,38 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-LABEL: v8i8_phi_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB9_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: .LBB9_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB9_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB9_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -859,15 +859,15 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB14_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
; GFX942-NEXT: .LBB14_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
@@ -878,9 +878,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -909,15 +909,15 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
@@ -957,18 +957,18 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
index 145f1e4..ff18b32 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
@@ -2,7 +2,7 @@
; A negative test to capture the expected error when the VGPRs are insufficient for wwm-regalloc.
-; CHECK: error: can't find enough VGPRs for wwm-regalloc
+; CHECK: error: cannot find enough VGPRs for wwm-regalloc
define amdgpu_kernel void @test(i32 %in) {
entry: