aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir2772
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir2496
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll891
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir40
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir121
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll3195
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll1173
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll264
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll1994
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll292
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll630
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll123
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll340
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/ssubo.ll676
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll553
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll499
21 files changed, 11006 insertions, 5090 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
index 789385d..b770d43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
@@ -1,12 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s
---
name: test_f32_add_mul
@@ -24,15 +20,7 @@ body: |
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -43,15 +31,7 @@ body: |
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_f32_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -62,15 +42,7 @@ body: |
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -81,15 +53,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -100,6 +63,60 @@ body: |
...
---
+name: test_f32_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = contract G_FMUL %0, %1
+ %5:_(s32) = contract G_FADD %4, %2
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_f32_add_mul_rhs
body: |
bb.1.entry:
@@ -115,15 +132,7 @@ body: |
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -134,15 +143,7 @@ body: |
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_f32_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -153,15 +154,7 @@ body: |
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -172,15 +165,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -191,6 +175,60 @@ body: |
...
---
+name: test_f32_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = contract G_FMUL %0, %1
+ %5:_(s32) = contract G_FADD %2, %4
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_add_mul_multiple_defs_z
body: |
bb.1.entry:
@@ -209,18 +247,7 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-DENORM-NEXT: {{ $}}
@@ -234,18 +261,7 @@ body: |
; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-LABEL: name: test_add_mul_multiple_defs_z
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -259,18 +275,7 @@ body: |
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-DENORM-NEXT: {{ $}}
@@ -284,18 +289,6 @@ body: |
; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%4:_(s32) = COPY $vgpr2
@@ -310,6 +303,76 @@ body: |
...
---
+name: test_add_mul_multiple_defs_z_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %4:_(s32) = COPY $vgpr2
+ %5:_(s32) = COPY $vgpr3
+ %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1)
+ %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
+ %8:_(s32) = COPY %13(s32)
+ %10:_(s32) = contract G_FADD %6, %8
+ $vgpr0 = COPY %10(s32)
+...
+
+---
name: test_add_mul_rhs_multiple_defs_z
body: |
bb.1.entry:
@@ -328,18 +391,7 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-DENORM-NEXT: {{ $}}
@@ -353,18 +405,7 @@ body: |
; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -378,18 +419,7 @@ body: |
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-DENORM-NEXT: {{ $}}
@@ -403,18 +433,6 @@ body: |
; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%4:_(s32) = COPY $vgpr2
@@ -429,6 +447,76 @@ body: |
...
---
+name: test_add_mul_rhs_multiple_defs_z_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %4:_(s32) = COPY $vgpr2
+ %5:_(s32) = COPY $vgpr3
+ %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1)
+ %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
+ %8:_(s32) = COPY %13(s32)
+ %10:_(s32) = contract G_FADD %8, %6
+ $vgpr0 = COPY %10(s32)
+...
+
+---
name: test_half_add_mul
body: |
bb.1.entry:
@@ -448,19 +536,7 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_half_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -475,19 +551,7 @@ body: |
; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_half_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -502,19 +566,7 @@ body: |
; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_half_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -529,19 +581,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -556,6 +595,80 @@ body: |
...
---
+name: test_half_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = contract G_FMUL %0, %1
+ %8:_(s16) = contract G_FADD %7, %2
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul_rhs
body: |
bb.1.entry:
@@ -575,19 +688,7 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -602,19 +703,7 @@ body: |
; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_half_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -629,19 +718,7 @@ body: |
; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -656,19 +733,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -683,6 +747,80 @@ body: |
...
---
+name: test_half_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = contract G_FMUL %0, %1
+ %8:_(s16) = contract G_FADD %2, %7
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_double_add_mul
body: |
bb.1.entry:
@@ -706,23 +844,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_double_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -741,23 +863,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_double_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -776,23 +882,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_double_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -811,23 +901,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -846,6 +919,101 @@ body: |
...
---
+name: test_double_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = contract G_FMUL %0, %1
+ %11:_(s64) = contract G_FADD %10, %2
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+
+---
name: test_double_add_mul_rhs
body: |
bb.1.entry:
@@ -869,23 +1037,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -904,23 +1056,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_double_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -939,23 +1075,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -974,23 +1094,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -1009,6 +1112,100 @@ body: |
...
---
+name: test_double_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = contract G_FMUL %0, %1
+ %11:_(s64) = contract G_FADD %2, %10
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xfloat_add_mul
body: |
bb.1.entry:
@@ -1040,32 +1237,7 @@ body: |
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX9-DENORM-NEXT: {{ $}}
@@ -1092,32 +1264,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10-LABEL: name: test_4xfloat_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX10-NEXT: {{ $}}
@@ -1144,32 +1291,7 @@ body: |
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX10-DENORM-NEXT: {{ $}}
@@ -1196,32 +1318,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1248,6 +1344,144 @@ body: |
...
---
+name: test_4xfloat_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+
+ ; GFX9-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+ %16:_(<4 x s32>) = contract G_FMUL %0, %1
+ %17:_(<4 x s32>) = contract G_FADD %16, %2
+ %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
+ $vgpr0 = COPY %19(s32)
+ $vgpr1 = COPY %20(s32)
+ $vgpr2 = COPY %21(s32)
+ $vgpr3 = COPY %22(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+...
+
+---
name: test_3xfloat_add_mul_rhs
body: |
bb.1.entry:
@@ -1275,28 +1509,7 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX9-DENORM-NEXT: {{ $}}
@@ -1319,28 +1532,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1363,28 +1555,7 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-DENORM-NEXT: {{ $}}
@@ -1407,28 +1578,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1451,6 +1600,124 @@ body: |
...
---
+name: test_3xfloat_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+
+ ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32)
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+ %13:_(<3 x s32>) = contract G_FMUL %0, %1
+ %14:_(<3 x s32>) = contract G_FADD %2, %13
+ %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
+ $vgpr0 = COPY %16(s32)
+ $vgpr1 = COPY %17(s32)
+ $vgpr2 = COPY %18(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+...
+
+---
name: test_4xhalf_add_mul
body: |
bb.1.entry:
@@ -1474,24 +1741,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1510,24 +1760,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_4xhalf_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1546,24 +1779,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1582,24 +1798,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1618,6 +1816,105 @@ body: |
...
---
+name: test_4xhalf_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+ %10:_(<4 x s16>) = contract G_FMUL %0, %1
+ %11:_(<4 x s16>) = contract G_FADD %10, %2
+ %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
+ $vgpr0 = COPY %13(<2 x s16>)
+ $vgpr1 = COPY %14(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+
+---
name: test_3xhalf_add_mul_rhs
body: |
bb.1.entry:
@@ -1648,31 +1945,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1698,31 +1970,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1748,31 +1995,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1797,31 +2019,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1846,6 +2043,134 @@ body: |
...
---
+name: test_3xhalf_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %10:_(<2 x s16>) = G_IMPLICIT_DEF
+ %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>)
+ %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>)
+ %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
+ %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+ %17:_(<3 x s16>) = contract G_FMUL %0, %1
+ %18:_(<3 x s16>) = contract G_FADD %2, %17
+ %22:_(<3 x s16>) = G_IMPLICIT_DEF
+ %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>)
+ %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
+ $vgpr0 = COPY %20(<2 x s16>)
+ $vgpr1 = COPY %21(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xdouble_add_mul
body: |
bb.1.entry:
@@ -1905,60 +2230,7 @@ body: |
; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX9-DENORM-NEXT: {{ $}}
@@ -2013,60 +2285,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX10-LABEL: name: test_4xdouble_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-NEXT: {{ $}}
@@ -2121,60 +2340,7 @@ body: |
; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-DENORM-NEXT: {{ $}}
@@ -2229,60 +2395,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2337,6 +2449,284 @@ body: |
...
---
+name: test_4xdouble_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX9-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64)
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %22:_(s32) = COPY $vgpr18
+ %23:_(s32) = COPY $vgpr19
+ %24:_(s32) = COPY $vgpr20
+ %25:_(s32) = COPY $vgpr21
+ %26:_(s32) = COPY $vgpr22
+ %27:_(s32) = COPY $vgpr23
+ %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32)
+ %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
+ %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
+ %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+ %40:_(<4 x s64>) = contract G_FMUL %0, %1
+ %41:_(<4 x s64>) = contract G_FADD %40, %2
+ %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
+ $vgpr0 = COPY %43(s32)
+ $vgpr1 = COPY %44(s32)
+ $vgpr2 = COPY %45(s32)
+ $vgpr3 = COPY %46(s32)
+ $vgpr4 = COPY %47(s32)
+ $vgpr5 = COPY %48(s32)
+ $vgpr6 = COPY %49(s32)
+ $vgpr7 = COPY %50(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+...
+
+---
name: test_3xdouble_add_mul_rhs
body: |
bb.1.entry:
@@ -2385,49 +2775,7 @@ body: |
; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX9-DENORM-NEXT: {{ $}}
@@ -2471,49 +2819,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-NEXT: {{ $}}
@@ -2557,49 +2863,7 @@ body: |
; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-DENORM-NEXT: {{ $}}
@@ -2643,49 +2907,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2727,3 +2948,226 @@ body: |
$vgpr5 = COPY %39(s32)
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
...
+
+---
+name: test_3xdouble_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+
+ ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64)
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+ %31:_(<3 x s64>) = contract G_FMUL %0, %1
+ %32:_(<3 x s64>) = contract G_FADD %2, %31
+ %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
+ $vgpr0 = COPY %34(s32)
+ $vgpr1 = COPY %35(s32)
+ $vgpr2 = COPY %36(s32)
+ $vgpr3 = COPY %37(s32)
+ $vgpr4 = COPY %38(s32)
+ $vgpr5 = COPY %39(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
index 42e53be..8f9fc67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
@@ -1,12 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s
---
name: test_f32_add_mul
@@ -25,16 +21,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -46,16 +32,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_f32_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -67,16 +43,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -87,16 +53,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -107,6 +63,60 @@ body: |
...
---
+name: test_f32_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = reassoc contract G_FMUL %0, %1
+ %5:_(s32) = reassoc contract G_FADD %4, %2
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_f32_add_mul_rhs
body: |
bb.1.entry:
@@ -123,16 +133,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -144,16 +144,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_f32_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -165,16 +155,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -185,16 +165,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -205,6 +175,60 @@ body: |
...
---
+name: test_f32_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = reassoc contract G_FMUL %0, %1
+ %5:_(s32) = reassoc contract G_FADD %2, %4
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul
body: |
bb.1.entry:
@@ -225,20 +249,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_half_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -254,20 +264,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_half_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -283,20 +279,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_half_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -311,20 +293,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -339,6 +307,81 @@ body: |
...
---
+name: test_half_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = reassoc contract G_FMUL %0, %1
+ %8:_(s16) = reassoc contract G_FADD %7, %2
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+
+---
name: test_half_add_mul_rhs
body: |
bb.1.entry:
@@ -359,20 +402,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -388,20 +417,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_half_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -417,20 +432,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -445,20 +446,84 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = reassoc G_FMUL %0, %1
+ %8:_(s16) = reassoc G_FADD %2, %7
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
+name: test_half_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ; GFX10-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -497,24 +562,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_double_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -534,24 +581,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_double_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -571,24 +600,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_double_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -607,24 +618,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -643,6 +636,100 @@ body: |
...
---
+name: test_double_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = reassoc contract G_FMUL %0, %1
+ %11:_(s64) = reassoc contract G_FADD %10, %2
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_double_add_mul_rhs
body: |
bb.1.entry:
@@ -667,24 +754,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -704,24 +773,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_double_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -741,24 +792,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -777,24 +810,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -813,6 +828,100 @@ body: |
...
---
+name: test_double_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = reassoc contract G_FMUL %0, %1
+ %11:_(s64) = reassoc contract G_FADD %2, %10
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xfloat_add_mul
body: |
bb.1.entry:
@@ -845,32 +954,6 @@ body: |
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX9-DENORM-NEXT: {{ $}}
@@ -898,32 +981,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX10-LABEL: name: test_4xfloat_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX10-NEXT: {{ $}}
@@ -951,32 +1008,6 @@ body: |
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX10-DENORM-NEXT: {{ $}}
@@ -1003,32 +1034,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1055,6 +1060,140 @@ body: |
...
---
+name: test_4xfloat_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+
+ ; GFX9-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+ %16:_(<4 x s32>) = reassoc contract G_FMUL %0, %1
+ %17:_(<4 x s32>) = reassoc contract G_FADD %16, %2
+ %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
+ $vgpr0 = COPY %19(s32)
+ $vgpr1 = COPY %20(s32)
+ $vgpr2 = COPY %21(s32)
+ $vgpr3 = COPY %22(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+...
+
+---
name: test_3xfloat_add_mul_rhs
body: |
bb.1.entry:
@@ -1083,28 +1222,6 @@ body: |
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX9-DENORM-NEXT: {{ $}}
@@ -1128,28 +1245,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1173,28 +1268,6 @@ body: |
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-DENORM-NEXT: {{ $}}
@@ -1217,28 +1290,124 @@ body: |
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32)
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+ %13:_(<3 x s32>) = reassoc G_FMUL %0, %1
+ %14:_(<3 x s32>) = reassoc G_FADD %2, %13
+ %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
+ $vgpr0 = COPY %16(s32)
+ $vgpr1 = COPY %17(s32)
+ $vgpr2 = COPY %18(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+...
+
+---
+name: test_3xfloat_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+
+ ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1285,24 +1454,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1322,24 +1473,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_4xhalf_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1359,24 +1492,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1395,24 +1510,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1431,6 +1528,100 @@ body: |
...
---
+name: test_4xhalf_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+ %10:_(<4 x s16>) = reassoc contract G_FMUL %0, %1
+ %11:_(<4 x s16>) = reassoc contract G_FADD %10, %2
+ %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
+ $vgpr0 = COPY %13(<2 x s16>)
+ $vgpr1 = COPY %14(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_3xhalf_add_mul_rhs
body: |
bb.1.entry:
@@ -1461,30 +1652,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1510,30 +1677,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1559,30 +1702,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1607,30 +1726,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1655,6 +1750,130 @@ body: |
...
---
+name: test_3xhalf_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %10:_(<2 x s16>) = G_IMPLICIT_DEF
+ %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>)
+ %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>)
+ %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
+ %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+ %17:_(<3 x s16>) = reassoc contract G_FMUL %0, %1
+ %18:_(<3 x s16>) = reassoc contract G_FADD %2, %17
+ %22:_(<3 x s16>) = G_IMPLICIT_DEF
+ %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>)
+ %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
+ $vgpr0 = COPY %20(<2 x s16>)
+ $vgpr1 = COPY %21(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xdouble_add_mul
body: |
bb.1.entry:
@@ -1715,60 +1934,6 @@ body: |
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX9-DENORM-NEXT: {{ $}}
@@ -1824,60 +1989,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX10-LABEL: name: test_4xdouble_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-NEXT: {{ $}}
@@ -1933,60 +2044,6 @@ body: |
; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-DENORM-NEXT: {{ $}}
@@ -2041,60 +2098,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2149,6 +2152,280 @@ body: |
...
---
+name: test_4xdouble_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX9-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64)
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %22:_(s32) = COPY $vgpr18
+ %23:_(s32) = COPY $vgpr19
+ %24:_(s32) = COPY $vgpr20
+ %25:_(s32) = COPY $vgpr21
+ %26:_(s32) = COPY $vgpr22
+ %27:_(s32) = COPY $vgpr23
+ %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32)
+ %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
+ %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
+ %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+ %40:_(<4 x s64>) = reassoc contract G_FMUL %0, %1
+ %41:_(<4 x s64>) = reassoc contract G_FADD %40, %2
+ %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
+ $vgpr0 = COPY %43(s32)
+ $vgpr1 = COPY %44(s32)
+ $vgpr2 = COPY %45(s32)
+ $vgpr3 = COPY %46(s32)
+ $vgpr4 = COPY %47(s32)
+ $vgpr5 = COPY %48(s32)
+ $vgpr6 = COPY %49(s32)
+ $vgpr7 = COPY %50(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+...
+
+---
name: test_3xdouble_add_mul_rhs
body: |
bb.1.entry:
@@ -2198,49 +2475,6 @@ body: |
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX9-DENORM-NEXT: {{ $}}
@@ -2285,49 +2519,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-NEXT: {{ $}}
@@ -2372,49 +2563,6 @@ body: |
; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-DENORM-NEXT: {{ $}}
@@ -2458,49 +2606,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2542,3 +2647,222 @@ body: |
$vgpr5 = COPY %39(s32)
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
...
+
+---
+name: test_3xdouble_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+
+ ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64)
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+ %31:_(<3 x s64>) = reassoc contract G_FMUL %0, %1
+ %32:_(<3 x s64>) = reassoc contract G_FADD %2, %31
+ %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
+ $vgpr0 = COPY %34(s32)
+ $vgpr1 = COPY %35(s32)
+ $vgpr2 = COPY %36(s32)
+ $vgpr3 = COPY %37(s32)
+ $vgpr4 = COPY %38(s32)
+ $vgpr5 = COPY %39(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 24dd535..3f6e3d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -2,11 +2,9 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX9-UNSAFE %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX10-UNSAFE %s
define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul:
@@ -28,12 +26,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_f32_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_f32_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +44,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_f32_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -64,6 +55,58 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
ret float %b
}
+define float @test_f32_add_mul_contract(float %x, float %y, float %z) {
+; GFX9-LABEL: test_f32_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_f32_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_f32_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_f32_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_f32_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_f32_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_f32_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_f32_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %b = fadd contract float %a, %z
+ ret float %b
+}
+
define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -84,12 +127,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_f32_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -108,7 +145,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,6 +156,58 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
ret float %b
}
+define float @test_f32_add_mul_rhs_contract(float %x, float %y, float %z) {
+; GFX9-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %b = fadd contract float %z, %a
+ ret float %b
+}
+
define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
; GFX9-LABEL: test_add_mul_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
@@ -147,14 +235,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_add_mul_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +261,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -198,17 +277,16 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
ret float %b
}
-define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
-; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
+define float @test_add_mul_multiple_defs_z_contract(float %x, float %y, ptr addrspace(1) %vec_ptr) {
+; GFX9-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
@@ -216,7 +294,7 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
@@ -225,13 +303,81 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX10-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %vec = load <2 x float>, ptr addrspace(1) %vec_ptr
+ %z = extractelement <2 x float> %vec, i64 1
+ %b = fadd contract float %a, %z
+ ret float %b
+}
+
+define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
+; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
@@ -259,7 +405,6 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -296,12 +441,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_half_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_half_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -321,7 +460,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_half_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -333,6 +471,59 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
ret half %b
}
+define half @test_half_add_mul_contract(half %x, half %y, half %z) {
+; GFX9-LABEL: test_half_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_half_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_half_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_half_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_half_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_half_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_half_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_half_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract half %x, %y
+ %b = fadd contract half %a, %z
+ ret half %b
+}
+
define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX9-LABEL: test_half_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -353,12 +544,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_half_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,7 +563,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -390,6 +574,59 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
ret half %b
}
+define half @test_half_add_mul_rhs_contract(half %x, half %y, half %z) {
+; GFX9-LABEL: test_half_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_half_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract half %x, %y
+ %b = fadd contract half %z, %a
+ ret half %b
+}
+
define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX9-LABEL: test_double_add_mul:
; GFX9: ; %bb.0: ; %.entry
@@ -411,12 +648,6 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_double_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_double_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -436,15 +667,61 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
+
+define double @test_double_add_mul_contract(double %x, double %y, double %z) {
+; GFX9-LABEL: test_double_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_double_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_double_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
+; GFX10-LABEL: test_double_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_double_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_double_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_double_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_double_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul double %x, %y
- %b = fadd double %a, %z
+ %a = fmul contract double %x, %y
+ %b = fadd contract double %a, %z
ret double %b
}
@@ -469,12 +746,6 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_double_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,15 +765,61 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul double %x, %y
+ %b = fadd double %z, %a
+ ret double %b
+}
+
+define double @test_double_add_mul_rhs_contract(double %x, double %y, double %z) {
+; GFX9-LABEL: test_double_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs:
+; GFX10-LABEL: test_double_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_double_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul double %x, %y
- %b = fadd double %z, %a
+ %a = fmul contract double %x, %y
+ %b = fadd contract double %z, %a
ret double %b
}
@@ -538,15 +855,6 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
-; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9
-; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10
-; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xfloat_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -577,8 +885,75 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10
; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <4 x float> %x, %y
+ %b = fadd <4 x float> %a, %z
+ ret <4 x float> %b
+}
+
+define <4 x float> @test_4xfloat_add_mul_contract(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; GFX9-LABEL: test_4xfloat_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v4, v8
+; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v5, v9
+; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10
+; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul:
+; GFX10-LABEL: test_4xfloat_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xfloat_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xfloat_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-DENORM-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
@@ -587,8 +962,8 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX10-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <4 x float> %x, %y
- %b = fadd <4 x float> %a, %z
+ %a = fmul contract <4 x float> %x, %y
+ %b = fadd contract <4 x float> %a, %z
ret <4 x float> %b
}
@@ -620,14 +995,6 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
-; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7
-; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xfloat_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -654,8 +1021,68 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7
; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <3 x float> %x, %y
+ %b = fadd <3 x float> %z, %a
+ ret <3 x float> %b
+}
+
+define <3 x float> @test_3xfloat_add_mul_rhs_contract(<3 x float> %x, <3 x float> %y, <3 x float> %z) {
+; GFX9-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
+; GFX9-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v3, v6
+; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7
+; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
@@ -663,8 +1090,8 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX10-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <3 x float> %x, %y
- %b = fadd <3 x float> %z, %a
+ %a = fmul contract <3 x float> %x, %y
+ %b = fadd contract <3 x float> %z, %a
ret <3 x float> %b
}
@@ -694,13 +1121,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xhalf_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -725,7 +1145,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -738,6 +1157,70 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
ret <4 x half> %b
}
+define <4 x half> @test_4xhalf_add_mul_contract(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
+; GFX9-LABEL: test_4xhalf_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_4xhalf_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <4 x half> %x, %y
+ %b = fadd contract <4 x half> %a, %z
+ ret <4 x half> %b
+}
+
define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
; GFX9-LABEL: test_3xhalf_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -764,13 +1247,6 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xhalf_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -795,16 +1271,73 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <3 x half> %x, %y
+ %b = fadd <3 x half> %z, %a
+ ret <3 x half> %b
+}
+
+define <3 x half> @test_3xhalf_add_mul_rhs_contract(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
+; GFX9-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
+; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <3 x half> %x, %y
- %b = fadd <3 x half> %z, %a
+ %a = fmul contract <3 x half> %x, %y
+ %b = fadd contract <3 x half> %z, %a
ret <3 x half> %b
}
@@ -844,15 +1377,6 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xdouble_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -887,7 +1411,14 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], v[20:21]
; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_4xdouble_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -902,6 +1433,66 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
ret <4 x double> %b
}
+define <4 x double> @test_4xdouble_add_mul_contract(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
+; GFX9-LABEL: test_4xdouble_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xdouble_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xdouble_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_4xdouble_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xdouble_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xdouble_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <4 x double> %x, %y
+ %b = fadd contract <4 x double> %a, %z
+ ret <4 x double> %b
+}
+
define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
; GFX9-LABEL: test_3xdouble_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -933,14 +1524,6 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xdouble_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -970,7 +1553,13 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[14:15], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -983,3 +1572,57 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
%b = fadd <3 x double> %z, %a
ret <3 x double> %b
}
+
+define <3 x double> @test_3xdouble_add_mul_rhs_contract(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
+; GFX9-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <3 x double> %x, %y
+ %b = fadd contract <3 x double> %z, %a
+ ret <3 x double> %b
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index 2845a63..d9ac9a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -24,8 +24,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FADD %6, %el1
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FADD %6, %el1
$vgpr0 = COPY %7(s32)
...
@@ -54,8 +54,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FADD %el1, %6
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FADD %el1, %6
$vgpr0 = COPY %7(s32)
...
@@ -233,10 +233,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
- %10:_(s16) = G_FMUL %7, %9
+ %10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %0, %1, %11
- %13:_(s32) = G_FADD %12, %el1
+ %13:_(s32) = contract G_FADD %12, %el1
$vgpr0 = COPY %13(s32)
...
@@ -282,11 +282,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
- %12:_(s16) = G_FMUL %9, %11
- %13:_(s16) = G_FMUL %1, %3
- %14:_(s16) = G_FADD %13, %12
+ %12:_(s16) = contract G_FMUL %9, %11
+ %13:_(s16) = contract G_FMUL %1, %3
+ %14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
- %16:_(s32) = G_FADD %15, %el1
+ %16:_(s32) = contract G_FADD %15, %el1
$vgpr0 = COPY %16(s32)
...
@@ -326,10 +326,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
- %10:_(s16) = G_FMUL %7, %9
+ %10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %4, %5, %11
- %13:_(s32) = G_FADD %el1, %12
+ %13:_(s32) = contract G_FADD %el1, %12
$vgpr0 = COPY %13(s32)
...
@@ -375,11 +375,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
- %12:_(s16) = G_FMUL %9, %11
- %13:_(s16) = G_FMUL %5, %7
- %14:_(s16) = G_FADD %13, %12
+ %12:_(s16) = contract G_FMUL %9, %11
+ %13:_(s16) = contract G_FMUL %5, %7
+ %14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
- %16:_(s32) = G_FADD %el1, %15
+ %16:_(s32) = contract G_FADD %el1, %15
$vgpr0 = COPY %16(s32)
...
@@ -409,8 +409,8 @@ body: |
%ptr:_(p1) = COPY $vgpr0_vgpr1
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FSUB %6, %el1
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FSUB %6, %el1
$vgpr0 = COPY %7(s32)
...
@@ -440,7 +440,7 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FSUB %el1, %6
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FSUB %el1, %6
$vgpr0 = COPY %7(s32)
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index c4d57ac..da25ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX11-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
ret void
@@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
index c82ae2fb..bf36979 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -13,7 +13,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -30,7 +30,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0)
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
index f513de8..477ef32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
@@ -385,117 +385,16 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[C2]]
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32)
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]]
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]]
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]]
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]]
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]]
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744
- ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]]
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ADD]], [[C9]](s32)
- ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]]
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD]]
- ; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]]
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
- ; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]]
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096
- ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32)
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]]
- ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1)
- ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]]
- ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[C10]]
- ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]]
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]]
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32)
- ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]]
- ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1)
- ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
- ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]]
- ; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1)
- ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]]
- ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]]
- ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
- ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD]](s32), [[C17]]
- ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]]
- ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039
- ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C18]]
- ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]]
- ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32)
- ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]]
- ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]]
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C1]]
- ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[C2]]
- ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32)
- ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]]
- ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]]
- ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]]
- ; CHECK-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]]
- ; CHECK-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1)
- ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]]
- ; CHECK-NEXT: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]]
- ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]]
- ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]]
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ADD2]], [[C9]](s32)
- ; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]]
- ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD2]]
- ; CHECK-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]]
- ; CHECK-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]]
- ; CHECK-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]]
- ; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32)
- ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32)
- ; CHECK-NEXT: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]]
- ; CHECK-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1)
- ; CHECK-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]]
- ; CHECK-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD2]](s32), [[C10]]
- ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]]
- ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]]
- ; CHECK-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32)
- ; CHECK-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]]
- ; CHECK-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1)
- ; CHECK-NEXT: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]]
- ; CHECK-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1)
- ; CHECK-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]]
- ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]]
- ; CHECK-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD2]](s32), [[C17]]
- ; CHECK-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]]
- ; CHECK-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD2]](s32), [[C18]]
- ; CHECK-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]]
- ; CHECK-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32)
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]]
- ; CHECK-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]]
- ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[OR7]], [[C21]]
- ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[OR15]], [[C21]]
- ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32)
- ; CHECK-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV]](s64)
+ ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC]](s32)
+ ; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV1]](s64)
+ ; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC2]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC3]](s16)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s16>) = afn G_FPTRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index d0b41e1..57b4857 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f32_to_f16(
; SI-SDAG-LABEL: fptrunc_f32_to_f16:
@@ -201,8 +201,8 @@ entry:
ret void
}
-define amdgpu_kernel void @fptrunc_f64_to_f16(
-; SI-SDAG-LABEL: fptrunc_f64_to_f16:
+define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
+; SI-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; SI-SDAG: ; %bb.0: ; %entry
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -212,29 +212,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s8, s2
; SI-SDAG-NEXT: s_mov_b32 s9, s3
-; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-SDAG-NEXT: s_mov_b32 s4, s0
; SI-SDAG-NEXT: s_mov_b32 s5, s1
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-SDAG-NEXT: s_endpgm
;
-; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; SI-GISEL: ; %bb.0: ; %entry
; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3
; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
-; VI-SDAG-LABEL: fptrunc_f64_to_f16:
+; VI-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; VI-SDAG: ; %bb.0: ; %entry
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -244,29 +242,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s8, s2
; VI-SDAG-NEXT: s_mov_b32 s9, s3
-; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-SDAG-NEXT: s_mov_b32 s4, s0
; VI-SDAG-NEXT: s_mov_b32 s5, s1
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-SDAG-NEXT: s_endpgm
;
-; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; VI-GISEL: ; %bb.0: ; %entry
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -276,29 +272,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -308,23 +302,541 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_mov_b32 s8, s2
; GFX950-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX950-SDAG-NEXT: s_mov_b32 s4, s0
; GFX950-SDAG-NEXT: s_mov_b32 s5, s1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX950-SDAG-NEXT: s_endpgm
;
+; GFX950-GISEL-LABEL: fptrunc_f32_to_f16_afn:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load float, ptr addrspace(1) %a
+ %r.val = fptrunc afn float %a.val to half
+ store half %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16(
+; SI-SDAG-LABEL: fptrunc_f64_to_f16:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s6, s6, s7
+; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s6, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; SI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s6, s1, s0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s6, s5, 8
+; SI-GISEL-NEXT: s_and_b32 s7, s5, 0x1ff
+; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
+; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
+; SI-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; SI-GISEL-NEXT: s_or_b32 s9, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s7
+; SI-GISEL-NEXT: s_max_i32 s7, s8, 0
+; SI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; SI-GISEL-NEXT: s_lshr_b32 s8, s9, s7
+; SI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_cmp_lg_u32 s7, s9
+; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_and_b32 s7, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s7, 3
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s7, 5
+; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s7
+; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s3, s6, s4
+; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s3
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f16:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1
+; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; VI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; VI-SDAG-NEXT: s_or_b32 s9, s5, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; VI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0
+; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
+; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
+; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
+; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
+; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX9-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; GFX9-GISEL-NEXT: s_max_i32 s7, s7, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s2, s6
+; GFX9-GISEL-NEXT: s_min_i32 s7, s7, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s2, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s2, s8, s2
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; GFX9-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX9-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
; GFX950-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX950-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; GFX950-GISEL-NEXT: s_max_i32 s7, s7, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s2, s6
+; GFX950-GISEL-NEXT: s_min_i32 s7, s7, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s2, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s2, s8, s2
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; GFX950-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX950-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -340,13 +852,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
@@ -360,13 +919,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
@@ -376,6 +982,555 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load double, ptr addrspace(1) %a
+ %r.val = fptrunc double %a.val to half
+ store half %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
+; SI-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s6, s6, s7
+; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s6, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; SI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s6, s1, s0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1
+; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; VI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; VI-SDAG-NEXT: s_or_b32 s9, s5, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; VI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0
+; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
@@ -384,7 +1539,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -401,7 +1556,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
ptr addrspace(1) %a) {
entry:
%a.val = load double, ptr addrspace(1) %a
- %r.val = fptrunc double %a.val to half
+ %r.val = fptrunc afn double %a.val to half
store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -626,25 +1781,106 @@ entry:
define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; SI-SDAG: ; %bb.0: ; %entry
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: s_mov_b32 s10, s6
-; SI-SDAG-NEXT: s_mov_b32 s11, s7
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s8, s2
-; SI-SDAG-NEXT: s_mov_b32 s9, s3
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; SI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; SI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; SI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; SI-SDAG-NEXT: s_or_b32 s1, s1, s7
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8
+; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s0, s6, s0
+; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; SI-SDAG-NEXT: s_or_b32 s6, s0, s1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
@@ -654,6 +1890,1251 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s8, s5, 8
+; SI-GISEL-NEXT: s_and_b32 s9, s5, 0x1ff
+; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
+; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
+; SI-GISEL-NEXT: s_sub_i32 s10, 1, s3
+; SI-GISEL-NEXT: s_or_b32 s11, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s8, s8, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s9
+; SI-GISEL-NEXT: s_max_i32 s9, s10, 0
+; SI-GISEL-NEXT: s_min_i32 s9, s9, 13
+; SI-GISEL-NEXT: s_lshr_b32 s10, s11, s9
+; SI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_cmp_lg_u32 s9, s11
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_and_b32 s9, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s9, 3
+; SI-GISEL-NEXT: s_cselect_b32 s10, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s9, 5
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s9
+; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s3, s8, s4
+; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
+; SI-GISEL-NEXT: s_bfe_u32 s5, s7, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s8, s7, 8
+; SI-GISEL-NEXT: s_and_b32 s9, s7, 0x1ff
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
+; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
+; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
+; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
+; SI-GISEL-NEXT: s_sub_i32 s9, 1, s5
+; SI-GISEL-NEXT: s_or_b32 s10, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s8
+; SI-GISEL-NEXT: s_max_i32 s8, s9, 0
+; SI-GISEL-NEXT: s_min_i32 s8, s8, 13
+; SI-GISEL-NEXT: s_lshr_b32 s9, s10, s8
+; SI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_cmp_lg_u32 s8, s10
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_cmp_lt_i32 s5, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_and_b32 s8, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s8
+; SI-GISEL-NEXT: s_cmp_gt_i32 s5, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_lshr_b32 s5, s7, 16
+; SI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
+; SI-GISEL-NEXT: s_and_b32 s5, s5, 0x8000
+; SI-GISEL-NEXT: s_or_b32 s4, s5, s4
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; SI-GISEL-NEXT: s_or_b32 s4, s3, s4
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff
+; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; VI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8
+; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; VI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8
+; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s6
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; VI-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; VI-GISEL-NEXT: s_max_i32 s9, s9, 0
+; VI-GISEL-NEXT: s_or_b32 s8, s3, s8
+; VI-GISEL-NEXT: s_min_i32 s9, s9, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s3, 12
+; VI-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; VI-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; VI-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; VI-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s3, s10, s3
+; VI-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; VI-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; VI-GISEL-NEXT: s_and_b32 s8, s3, 7
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; VI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; VI-GISEL-NEXT: s_add_i32 s3, s3, s8
+; VI-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; VI-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
+; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; VI-GISEL-NEXT: s_max_i32 s8, s8, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s4, s6
+; VI-GISEL-NEXT: s_min_i32 s8, s8, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s4, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; VI-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; VI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; VI-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s4, 7
+; VI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s8, s6
+; VI-GISEL-NEXT: s_add_i32 s4, s4, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; VI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; VI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s3, s4, s3
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_or_b32 s2, s2, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; GFX9-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; GFX9-GISEL-NEXT: s_max_i32 s9, s9, 0
+; GFX9-GISEL-NEXT: s_or_b32 s8, s3, s8
+; GFX9-GISEL-NEXT: s_min_i32 s9, s9, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s3, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s3, s10, s3
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; GFX9-GISEL-NEXT: s_and_b32 s8, s3, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s8, s9, s8
+; GFX9-GISEL-NEXT: s_add_i32 s3, s3, s8
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; GFX9-GISEL-NEXT: s_max_i32 s8, s8, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s4, s6
+; GFX9-GISEL-NEXT: s_min_i32 s8, s8, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s4, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s4, s9, s4
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; GFX9-GISEL-NEXT: s_and_b32 s6, s4, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX9-GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s3, s4, s3
+; GFX9-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; GFX950-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; GFX950-GISEL-NEXT: s_max_i32 s9, s9, 0
+; GFX950-GISEL-NEXT: s_or_b32 s8, s3, s8
+; GFX950-GISEL-NEXT: s_min_i32 s9, s9, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s3, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; GFX950-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s3, s10, s3
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; GFX950-GISEL-NEXT: s_and_b32 s8, s3, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s8, s9, s8
+; GFX950-GISEL-NEXT: s_add_i32 s3, s3, s8
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; GFX950-GISEL-NEXT: s_max_i32 s8, s8, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s4, s6
+; GFX950-GISEL-NEXT: s_min_i32 s8, s8, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s4, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s4, s9, s4
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; GFX950-GISEL-NEXT: s_and_b32 s6, s4, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX950-GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s3, s4, s3
+; GFX950-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s8, s8, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s9, s2, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s8, s8, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s4, s4, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s11, s10, s8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s9
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s11, s8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s8, s10
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s11, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s3, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s2, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s4, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s10, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s8
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s10, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s9
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s10, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s6, s3
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s8, s8, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s9, s2, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s8, s8, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s4, s4, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s11, s10, s8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s9
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s11, s8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s8, s10
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s11, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s3, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s2, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s4, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s10, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s8
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s10, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s9
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s10, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s6, s3
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load <2 x double>, ptr addrspace(1) %a
+ %r.val = fptrunc <2 x double> %a.val to <2 x half>
+ store <2 x half> %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
+; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; SI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; SI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; SI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; SI-SDAG-NEXT: s_or_b32 s1, s1, s7
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8
+; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s0, s6, s0
+; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; SI-SDAG-NEXT: s_or_b32 s6, s0, s1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -664,29 +3145,111 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
-; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff
+; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; VI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8
+; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; VI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8
+; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s6
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
-; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; VI-GISEL: ; %bb.0: ; %entry
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -702,29 +3265,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -740,27 +3383,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX950-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX950-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX950-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX950-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX950-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX950-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2
-; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-SDAG-NEXT: s_endpgm
;
-; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -776,7 +3501,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
@@ -786,21 +3511,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
@@ -810,21 +3627,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
-; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -842,7 +3751,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -863,7 +3772,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
ptr addrspace(1) %a) {
entry:
%a.val = load <2 x double>, ptr addrspace(1) %a
- %r.val = fptrunc <2 x double> %a.val to <2 x half>
+ %r.val = fptrunc afn <2 x double> %a.val to <2 x half>
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 2bd3659..4f8eab1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -3,17 +3,15 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-GISEL,VI-UNSAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f32:
@@ -94,6 +92,85 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
ret void
}
+define amdgpu_kernel void @fptrunc_f64_to_f32_afn(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fptrunc_f64_to_f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-SDAG-NEXT: s_mov_b32 s4, s0
+; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn double %in to float
+ store float %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f16:
; SI: ; %bb.0:
@@ -203,56 +280,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SAFE-SDAG-NEXT: s_endpgm
;
-; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; VI-SAFE-GISEL: ; %bb.0:
-; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
-; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
-; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
-; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
-; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-SAFE-GISEL-NEXT: s_endpgm
+; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
+; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
+; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
+; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
+; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
;
; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-SDAG: ; %bb.0:
@@ -265,17 +342,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-UNSAFE-SDAG-NEXT: s_endpgm
;
-; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; VI-UNSAFE-GISEL: ; %bb.0:
-; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-UNSAFE-GISEL-NEXT: s_endpgm
-;
; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-SDAG: ; %bb.0:
; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -328,56 +394,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
;
-; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX10-SAFE-GISEL: ; %bb.0:
-; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX10-SAFE-GISEL-NEXT: s_endpgm
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
+; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX10-GISEL-NEXT: s_max_i32 s6, s6, 0
+; GFX10-GISEL-NEXT: s_lshl_b32 s7, s4, 12
+; GFX10-GISEL-NEXT: s_min_i32 s6, s6, 13
+; GFX10-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX10-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX10-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX10-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s6, s8
+; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s6, s9, s6
+; GFX10-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX10-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX10-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX10-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX10-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX10-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
;
; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-SDAG: ; %bb.0:
@@ -390,17 +456,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
;
-; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX10-UNSAFE-GISEL: ; %bb.0:
-; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX10-UNSAFE-GISEL-NEXT: s_endpgm
-;
; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-SDAG: ; %bb.0:
; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -461,62 +516,368 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-SDAG-NEXT: s_endpgm
;
-; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0:
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0:
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm
+ %result = fptrunc double %in to half
+ %result_i16 = bitcast half %result to i16
+ store i16 %result_i16, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fptrunc_f64_to_f16_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_movk_i32 s2, 0x7e00
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s0, s7, 8
+; SI-NEXT: s_and_b32 s1, s7, 0x1ff
+; SI-NEXT: s_and_b32 s8, s0, 0xffe
+; SI-NEXT: s_or_b32 s0, s1, s6
+; SI-NEXT: s_cmp_lg_u32 s0, 0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; SI-NEXT: v_readfirstlane_b32 s1, v0
+; SI-NEXT: s_sub_i32 s6, 0x3f1, s0
+; SI-NEXT: s_or_b32 s1, s8, s1
+; SI-NEXT: v_med3_i32 v0, s6, 0, 13
+; SI-NEXT: s_or_b32 s6, s1, 0x1000
+; SI-NEXT: v_readfirstlane_b32 s8, v0
+; SI-NEXT: s_lshr_b32 s9, s6, s8
+; SI-NEXT: s_lshl_b32 s8, s9, s8
+; SI-NEXT: s_cmp_lg_u32 s8, s6
+; SI-NEXT: s_cselect_b32 s6, 1, 0
+; SI-NEXT: s_addk_i32 s0, 0xfc10
+; SI-NEXT: s_or_b32 s6, s9, s6
+; SI-NEXT: s_lshl_b32 s8, s0, 12
+; SI-NEXT: s_or_b32 s8, s1, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 1
+; SI-NEXT: s_cselect_b32 s6, s6, s8
+; SI-NEXT: s_and_b32 s8, s6, 7
+; SI-NEXT: s_cmp_gt_i32 s8, 5
+; SI-NEXT: s_cselect_b32 s9, 1, 0
+; SI-NEXT: s_cmp_eq_u32 s8, 3
+; SI-NEXT: s_cselect_b32 s8, 1, 0
+; SI-NEXT: s_lshr_b32 s6, s6, 2
+; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_add_i32 s6, s6, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 31
+; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; SI-NEXT: s_cmp_lg_u32 s1, 0
+; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00
+; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; SI-NEXT: s_cselect_b32 s0, s1, s6
+; SI-NEXT: s_lshr_b32 s1, s7, 16
+; SI-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-NEXT: s_or_b32 s6, s1, s0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-SAFE-SDAG: ; %bb.0:
+; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
+; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6
+; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7
+; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16
+; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-UNSAFE-SDAG: ; %bb.0:
+; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-SDAG-NEXT: s_endpgm
+;
+; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-SAFE-SDAG: ; %bb.0:
+; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
+; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-UNSAFE-SDAG: ; %bb.0:
+; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
+;
+; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SAFE-SDAG: ; %bb.0:
+; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-SDAG-NEXT: s_endpgm
+;
+; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-SAFE-GISEL: ; %bb.0:
; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-GISEL-NEXT: s_endpgm
;
-; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0:
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -528,7 +889,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0:
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -540,7 +901,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-GISEL-TRUE16: ; %bb.0:
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -552,7 +913,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-GISEL-FAKE16: ; %bb.0:
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -563,7 +924,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_endpgm
- %result = fptrunc double %in to half
+ %result = fptrunc afn double %in to half
%result_i16 = bitcast half %result to i16
store i16 %result_i16, ptr addrspace(1) %out
ret void
@@ -662,6 +1023,99 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32_afn(ptr addrspace(1) %out, <2 x double> %in) {
+; SI-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <2 x double> %in to <2 x float>
+ store <2 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) {
; SI-LABEL: fptrunc_v3f64_to_v3f32:
; SI: ; %bb.0:
@@ -769,6 +1223,113 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v3f64_to_v3f32_afn(ptr addrspace(1) %out, <3 x double> %in) {
+; SI-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x15
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[4:5]
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <3 x double> %in to <3 x float>
+ store <3 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) {
; SI-LABEL: fptrunc_v4f64_to_v4f32:
; SI: ; %bb.0:
@@ -876,6 +1437,113 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32_afn(ptr addrspace(1) %out, <4 x double> %in) {
+; SI-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <4 x double> %in to <4 x float>
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) {
; SI-LABEL: fptrunc_v8f64_to_v8f32:
; SI: ; %bb.0:
@@ -1019,3 +1687,150 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
store <8 x float> %result, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32_afn(ptr addrspace(1) %out, <8 x double> %in) {
+; SI-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; SI-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; SI-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; SI-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc <8 x double> %in to <8 x float>
+ store <8 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-SAFE-GISEL: {{.*}}
+; VI-SAFE-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 7d85d34..beda16c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1,13 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index d358837..8081a15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v10, s20
-; GCN-NEXT: v_mov_b32_e32 v11, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
-; GCN-NEXT: v_mov_b32_e32 v12, s22
-; GCN-NEXT: v_mov_b32_e32 v13, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v10, s20
-; GCN-NEXT: v_mov_b32_e32 v11, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v12, s22
-; GCN-NEXT: v_mov_b32_e32 v13, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store <16 x float> %result, ptr addrspace(1) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
store <16 x float> %result, ptr addrspace(1) %out
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 21465be..d81ec1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v10, s20
-; HEURRC-NEXT: v_mov_b32_e32 v11, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v12, s22
-; HEURRC-NEXT: v_mov_b32_e32 v13, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1687,7 +1653,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -1701,41 +1667,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
@@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v10, s20
-; HEURRC-NEXT: v_mov_b32_e32 v11, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v12, s22
-; HEURRC-NEXT: v_mov_b32_e32 v13, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2051,7 +1995,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -2065,41 +2009,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
@@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2781,24 +2677,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2827,24 +2721,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: v_mov_b32_e32 v6, s12
-; HEURRC-NEXT: v_mov_b32_e32 v7, s13
-; HEURRC-NEXT: v_mov_b32_e32 v8, s14
-; HEURRC-NEXT: v_mov_b32_e32 v9, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v4, s12
+; HEURRC-NEXT: v_mov_b32_e32 v5, s13
+; HEURRC-NEXT: v_mov_b32_e32 v6, s14
+; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2852,24 +2746,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3]
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
; AGPR: ; %bb.0:
@@ -2930,24 +2824,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2976,24 +2868,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: v_mov_b32_e32 v6, s12
-; HEURRC-NEXT: v_mov_b32_e32 v7, s13
-; HEURRC-NEXT: v_mov_b32_e32 v8, s14
-; HEURRC-NEXT: v_mov_b32_e32 v9, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v4, s12
+; HEURRC-NEXT: v_mov_b32_e32 v5, s13
+; HEURRC-NEXT: v_mov_b32_e32 v6, s14
+; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3001,24 +2893,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
@@ -4246,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v6, s24
-; SDAG-NEXT: v_mov_b32_e32 v7, s25
-; SDAG-NEXT: v_mov_b32_e32 v8, s26
-; SDAG-NEXT: v_mov_b32_e32 v9, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s16
-; SDAG-NEXT: v_mov_b32_e32 v3, s17
-; SDAG-NEXT: v_mov_b32_e32 v4, s18
-; SDAG-NEXT: v_mov_b32_e32 v5, s19
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4318,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4371,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v6, s24
-; HEURRC-NEXT: v_mov_b32_e32 v7, s25
-; HEURRC-NEXT: v_mov_b32_e32 v8, s26
-; HEURRC-NEXT: v_mov_b32_e32 v9, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s16
-; HEURRC-NEXT: v_mov_b32_e32 v3, s17
-; HEURRC-NEXT: v_mov_b32_e32 v4, s18
-; HEURRC-NEXT: v_mov_b32_e32 v5, s19
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s12
-; HEURRC-NEXT: v_mov_b32_e32 v3, s13
-; HEURRC-NEXT: v_mov_b32_e32 v4, s14
-; HEURRC-NEXT: v_mov_b32_e32 v5, s15
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4442,17 +4312,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4463,42 +4333,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
@@ -4645,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v6, s24
-; SDAG-NEXT: v_mov_b32_e32 v7, s25
-; SDAG-NEXT: v_mov_b32_e32 v8, s26
-; SDAG-NEXT: v_mov_b32_e32 v9, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s16
-; SDAG-NEXT: v_mov_b32_e32 v3, s17
-; SDAG-NEXT: v_mov_b32_e32 v4, s18
-; SDAG-NEXT: v_mov_b32_e32 v5, s19
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4717,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4770,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v6, s24
-; HEURRC-NEXT: v_mov_b32_e32 v7, s25
-; HEURRC-NEXT: v_mov_b32_e32 v8, s26
-; HEURRC-NEXT: v_mov_b32_e32 v9, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s16
-; HEURRC-NEXT: v_mov_b32_e32 v3, s17
-; HEURRC-NEXT: v_mov_b32_e32 v4, s18
-; HEURRC-NEXT: v_mov_b32_e32 v5, s19
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s12
-; HEURRC-NEXT: v_mov_b32_e32 v3, s13
-; HEURRC-NEXT: v_mov_b32_e32 v4, s14
-; HEURRC-NEXT: v_mov_b32_e32 v5, s15
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4841,17 +4689,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4862,42 +4710,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
@@ -5045,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5088,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5124,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5279,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5322,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5358,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5643,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5664,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5747,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5768,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5845,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 37809da..f78ea92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v10, s16
-; SDAG-NEXT: v_mov_b32_e32 v11, s17
-; SDAG-NEXT: v_mov_b32_e32 v12, s18
-; SDAG-NEXT: v_mov_b32_e32 v13, s19
-; SDAG-NEXT: v_mov_b32_e32 v14, s20
-; SDAG-NEXT: v_mov_b32_e32 v15, s21
-; SDAG-NEXT: v_mov_b32_e32 v16, s22
-; SDAG-NEXT: v_mov_b32_e32 v17, s23
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s29
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b32_e32 v20, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1964,40 +1962,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v10, s16
-; SDAG-NEXT: v_mov_b32_e32 v11, s17
-; SDAG-NEXT: v_mov_b32_e32 v12, s18
-; SDAG-NEXT: v_mov_b32_e32 v13, s19
-; SDAG-NEXT: v_mov_b32_e32 v14, s20
-; SDAG-NEXT: v_mov_b32_e32 v15, s21
-; SDAG-NEXT: v_mov_b32_e32 v16, s22
-; SDAG-NEXT: v_mov_b32_e32 v17, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2031,40 +2025,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v10, s16
-; SDAG-NEXT: v_mov_b32_e32 v11, s17
-; SDAG-NEXT: v_mov_b32_e32 v12, s18
-; SDAG-NEXT: v_mov_b32_e32 v13, s19
-; SDAG-NEXT: v_mov_b32_e32 v14, s20
-; SDAG-NEXT: v_mov_b32_e32 v15, s21
-; SDAG-NEXT: v_mov_b32_e32 v16, s22
-; SDAG-NEXT: v_mov_b32_e32 v17, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2096,34 +2086,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v10, s16
-; SDAG-NEXT: v_mov_b32_e32 v11, s17
-; SDAG-NEXT: v_mov_b32_e32 v12, s18
-; SDAG-NEXT: v_mov_b32_e32 v13, s19
-; SDAG-NEXT: v_mov_b32_e32 v14, s20
-; SDAG-NEXT: v_mov_b32_e32 v15, s21
-; SDAG-NEXT: v_mov_b32_e32 v16, s22
-; SDAG-NEXT: v_mov_b32_e32 v17, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2162,34 +2148,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_mov_b32_e32 v10, s16
-; SDAG-NEXT: v_mov_b32_e32 v11, s17
-; SDAG-NEXT: v_mov_b32_e32 v12, s18
-; SDAG-NEXT: v_mov_b32_e32 v13, s19
-; SDAG-NEXT: v_mov_b32_e32 v14, s20
-; SDAG-NEXT: v_mov_b32_e32 v15, s21
-; SDAG-NEXT: v_mov_b32_e32 v16, s22
-; SDAG-NEXT: v_mov_b32_e32 v17, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v12, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s21
+; SDAG-NEXT: v_mov_b32_e32 v14, s22
+; SDAG-NEXT: v_mov_b32_e32 v15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
@@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index bc50058..0b2818f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v10, s16
-; SDAG-NEXT: v_mov_b32_e32 v11, s17
-; SDAG-NEXT: v_mov_b32_e32 v12, s18
-; SDAG-NEXT: v_mov_b32_e32 v13, s19
-; SDAG-NEXT: v_mov_b32_e32 v14, s20
-; SDAG-NEXT: v_mov_b32_e32 v15, s21
-; SDAG-NEXT: v_mov_b32_e32 v16, s22
-; SDAG-NEXT: v_mov_b32_e32 v17, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
-; SDAG-NEXT: v_mov_b32_e32 v0, s1
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v32, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
@@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
-; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b32_e32 v32, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: s_movk_i32 s2, 0x41
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -5031,77 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s12
+; SDAG-NEXT: v_mov_b32_e32 v33, s13
+; SDAG-NEXT: v_mov_b32_e32 v34, s14
+; SDAG-NEXT: v_mov_b32_e32 v35, s15
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v44, s24
+; SDAG-NEXT: v_mov_b32_e32 v45, s25
+; SDAG-NEXT: v_mov_b32_e32 v46, s26
+; SDAG-NEXT: v_mov_b32_e32 v47, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s10
-; SDAG-NEXT: v_mov_b32_e32 v11, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s8
-; SDAG-NEXT: v_mov_b32_e32 v9, s9
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5109,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a27, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a26, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a25, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a24, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a23, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a22, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a21, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a20, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a19, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a18, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a17, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a16, s8
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5177,77 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v24, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s21
+; SDAG-NEXT: v_mov_b32_e32 v26, s22
+; SDAG-NEXT: v_mov_b32_e32 v27, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v28, s24
+; SDAG-NEXT: v_mov_b32_e32 v29, s25
+; SDAG-NEXT: v_mov_b32_e32 v30, s26
+; SDAG-NEXT: v_mov_b32_e32 v31, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s10
-; SDAG-NEXT: v_mov_b32_e32 v11, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s8
-; SDAG-NEXT: v_mov_b32_e32 v9, s9
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5255,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -6298,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index ea9334a..31a48de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -1,8 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942-STRESS,GFX942-SDAG-STRESS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942-STRESS,GFX942-GISEL-STRESS %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
@@ -51,50 +49,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: s_nop 5
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
-;
-; GFX942-SDAG-STRESS-LABEL: test_mfma_f32_16x16x8xf32:
-; GFX942-SDAG-STRESS: ; %bb.0: ; %bb
-; GFX942-SDAG-STRESS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-STRESS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-STRESS-NEXT: s_nop 1
-; GFX942-SDAG-STRESS-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-STRESS-NEXT: s_nop 6
-; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-STRESS-NEXT: s_endpgm
-;
-; GFX942-GISEL-STRESS-LABEL: test_mfma_f32_16x16x8xf32:
-; GFX942-GISEL-STRESS: ; %bb.0: ; %bb
-; GFX942-GISEL-STRESS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 1.0
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s2, 0x40400000
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 2.0
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s3, 4.0
-; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-STRESS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-STRESS-NEXT: s_nop 1
-; GFX942-GISEL-STRESS-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-STRESS-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-STRESS-NEXT: s_nop 5
-; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-GISEL-STRESS-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -178,82 +132,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
-;
-; GFX942-SDAG-STRESS-LABEL: test_mfma_f32_32x32x4xf32:
-; GFX942-SDAG-STRESS: ; %bb.0: ; %bb
-; GFX942-SDAG-STRESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v2, 0x40400000
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v3, 4.0
-; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-STRESS-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX942-SDAG-STRESS-NEXT: s_nop 1
-; GFX942-SDAG-STRESS-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-SDAG-STRESS-NEXT: s_nop 7
-; GFX942-SDAG-STRESS-NEXT: s_nop 1
-; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-SDAG-STRESS-NEXT: s_endpgm
-;
-; GFX942-GISEL-STRESS-LABEL: test_mfma_f32_32x32x4xf32:
-; GFX942-GISEL-STRESS: ; %bb.0: ; %bb
-; GFX942-GISEL-STRESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-STRESS-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a15, s15
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 1.0
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 2.0
-; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 0x40400000
-; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 4.0
-; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
-; GFX942-GISEL-STRESS-NEXT: s_nop 1
-; GFX942-GISEL-STRESS-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-STRESS-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-GISEL-STRESS-NEXT: s_nop 7
-; GFX942-GISEL-STRESS-NEXT: s_nop 1
-; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-GISEL-STRESS-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -264,4 +142,3 @@ bb:
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX942: {{.*}}
-; GFX942-STRESS: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 8056881..b25fe83 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_mov_b32_e32 v13, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
@@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_mov_b32_e32 v13, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; GCN-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_mov_b32_e32 v14, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s9
-; SDAG-NEXT: v_mov_b32_e32 v16, s10
-; SDAG-NEXT: v_mov_b32_e32 v17, s11
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s0
-; SDAG-NEXT: v_mov_b32_e32 v7, s1
-; SDAG-NEXT: v_mov_b32_e32 v8, s2
-; SDAG-NEXT: v_mov_b32_e32 v9, s3
-; SDAG-NEXT: v_mov_b32_e32 v1, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
@@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s8
-; SDAG-NEXT: v_mov_b32_e32 v27, s9
-; SDAG-NEXT: v_mov_b32_e32 v28, s10
-; SDAG-NEXT: v_mov_b32_e32 v29, s11
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s0
-; SDAG-NEXT: v_mov_b32_e32 v23, s1
-; SDAG-NEXT: v_mov_b32_e32 v24, s2
-; SDAG-NEXT: v_mov_b32_e32 v25, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_mov_b32_e32 v14, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s9
-; SDAG-NEXT: v_mov_b32_e32 v16, s10
-; SDAG-NEXT: v_mov_b32_e32 v17, s11
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s0
-; SDAG-NEXT: v_mov_b32_e32 v7, s1
-; SDAG-NEXT: v_mov_b32_e32 v8, s2
-; SDAG-NEXT: v_mov_b32_e32 v9, s3
-; SDAG-NEXT: v_mov_b32_e32 v1, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
@@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_mov_b32_e32 v14, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s9
-; SDAG-NEXT: v_mov_b32_e32 v16, s10
-; SDAG-NEXT: v_mov_b32_e32 v17, s11
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s0
-; SDAG-NEXT: v_mov_b32_e32 v7, s1
-; SDAG-NEXT: v_mov_b32_e32 v8, s2
-; SDAG-NEXT: v_mov_b32_e32 v9, s3
-; SDAG-NEXT: v_mov_b32_e32 v1, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
@@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_mov_b32_e32 v14, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s9
-; SDAG-NEXT: v_mov_b32_e32 v16, s10
-; SDAG-NEXT: v_mov_b32_e32 v17, s11
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s0
-; SDAG-NEXT: v_mov_b32_e32 v7, s1
-; SDAG-NEXT: v_mov_b32_e32 v8, s2
-; SDAG-NEXT: v_mov_b32_e32 v9, s3
-; SDAG-NEXT: v_mov_b32_e32 v1, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
@@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7]
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_mov_b32_e32 v14, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s9
-; SDAG-NEXT: v_mov_b32_e32 v16, s10
-; SDAG-NEXT: v_mov_b32_e32 v17, s11
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s0
-; SDAG-NEXT: v_mov_b32_e32 v7, s1
-; SDAG-NEXT: v_mov_b32_e32 v8, s2
-; SDAG-NEXT: v_mov_b32_e32 v9, s3
-; SDAG-NEXT: v_mov_b32_e32 v1, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
@@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s8
-; SDAG-NEXT: v_mov_b32_e32 v27, s9
-; SDAG-NEXT: v_mov_b32_e32 v28, s10
-; SDAG-NEXT: v_mov_b32_e32 v29, s11
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s0
-; SDAG-NEXT: v_mov_b32_e32 v23, s1
-; SDAG-NEXT: v_mov_b32_e32 v24, s2
-; SDAG-NEXT: v_mov_b32_e32 v25, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s8
-; SDAG-NEXT: v_mov_b32_e32 v27, s9
-; SDAG-NEXT: v_mov_b32_e32 v28, s10
-; SDAG-NEXT: v_mov_b32_e32 v29, s11
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s0
-; SDAG-NEXT: v_mov_b32_e32 v23, s1
-; SDAG-NEXT: v_mov_b32_e32 v24, s2
-; SDAG-NEXT: v_mov_b32_e32 v25, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s8
-; SDAG-NEXT: v_mov_b32_e32 v27, s9
-; SDAG-NEXT: v_mov_b32_e32 v28, s10
-; SDAG-NEXT: v_mov_b32_e32 v29, s11
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s0
-; SDAG-NEXT: v_mov_b32_e32 v23, s1
-; SDAG-NEXT: v_mov_b32_e32 v24, s2
-; SDAG-NEXT: v_mov_b32_e32 v25, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v26, s8
-; SDAG-NEXT: v_mov_b32_e32 v27, s9
-; SDAG-NEXT: v_mov_b32_e32 v28, s10
-; SDAG-NEXT: v_mov_b32_e32 v29, s11
-; SDAG-NEXT: v_mov_b32_e32 v18, s12
-; SDAG-NEXT: v_mov_b32_e32 v19, s13
-; SDAG-NEXT: v_mov_b32_e32 v20, s14
-; SDAG-NEXT: v_mov_b32_e32 v21, s15
-; SDAG-NEXT: v_mov_b32_e32 v22, s0
-; SDAG-NEXT: v_mov_b32_e32 v23, s1
-; SDAG-NEXT: v_mov_b32_e32 v24, s2
-; SDAG-NEXT: v_mov_b32_e32 v25, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s16
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
@@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
ret <16 x float> %result
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
index 84123e6..393581f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -141,7 +141,6 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index bc25084..5e5e3bf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index 053038d..382d892 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,14 +1,116 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-; FUNC-LABEL: {{^}}ssubo_i64_zext:
define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
+; SI-LABEL: ssubo_i64_zext:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_sub_u32 s10, s2, s8
+; SI-NEXT: s_subb_u32 s11, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v1, s11
+; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: ssubo_i64_zext:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_sub_u32 s6, s2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_subb_u32 s7, s3, s5
+; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: ssubo_i64_zext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: ssubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: s_xor_b32 s2, s6, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ssubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s2, s4
+; GFX11-NEXT: s_subb_u32 s7, s3, s5
+; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
+; GFX11-NEXT: s_xor_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -18,8 +120,102 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i32:
define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind {
+; SI-LABEL: s_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_sub_i32 s12, s8, s9
+; SI-NEXT: s_cmp_gt_i32 s9, 0
+; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT: s_cmp_lt_i32 s12, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9]
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_i32 s6, s4, s5
+; VI-NEXT: s_cmp_gt_i32 s5, 0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s6, s4
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s4, s6, s7
+; GFX9-NEXT: v_sub_i32 v1, s6, v1 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX10-NEXT: s_sub_i32 s4, s6, s7
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
+; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX11-NEXT: s_sub_i32 s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %ssub, 0
%carry = extractvalue { i32, i1 } %ssub, 1
@@ -28,8 +224,112 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i32:
define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: flat_store_dword v[0:1], v6
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX9-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_i32 v3, v1, v2 clamp
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
+; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %aptr, align 4
%b = load i32, ptr addrspace(1) %bptr, align 4
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -40,10 +340,109 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i64:
-; GCN: s_sub_u32
-; GCN: s_subb_u32
define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind {
+; SI-LABEL: s_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_u32 s12, s4, s6
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s13, s5, s7
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v1, s13
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s8, s4, s6
+; GFX11-NEXT: s_subb_u32 s9, s5, s7
+; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -52,16 +451,121 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i64:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[6:7], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
+; GFX11-NEXT: global_store_b8 v6, v0, s[6:7]
+; GFX11-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %aptr, align 4
%b = load i64, ptr addrspace(1) %bptr, align 4
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -72,14 +576,134 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_v2i32:
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3
+; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp
+; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2
+; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX11-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX11-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index d230ff5..e1574dc 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
@@ -12,14 +14,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_add_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_addc_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -61,6 +63,40 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s4, s2, s6
+; GFX10-NEXT: s_addc_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s4, s2, s4
+; GFX11-NEXT: s_addc_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -76,21 +112,21 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_uaddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i32:
@@ -121,6 +157,34 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -137,17 +201,15 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +255,38 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +309,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +375,45 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +437,21 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s6, s4, s6
-; SI-NEXT: s_addc_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_addc_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i64:
@@ -359,6 +490,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s0, s12, s14
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s6, s4, s6
+; GFX11-NEXT: s_addc_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -375,17 +537,15 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +553,8 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +597,42 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +655,15 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +671,8 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +716,42 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +774,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +834,42 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_add_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +887,27 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_clamp_bit:
@@ -687,6 +952,45 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
@@ -711,19 +1015,19 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -786,6 +1090,50 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_add_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -813,23 +1161,23 @@ exit:
define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
; SI-LABEL: sv_uaddo_i128:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v6, s1
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_mov_b32_e32 v8, s3
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
-; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
-; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_and_b32_e32 v2, 1, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
@@ -871,6 +1219,41 @@ define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sv_uaddo_i128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sv_uaddo_i128:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX11-NEXT: v_mov_b16_e32 v2.l, v6.l
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
%uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
%carry = extractvalue { i128, i1 } %uadd, 1
%carry.ext = zext i1 %carry to i32
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 7d7f1b4..0289dab 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64_zext:
@@ -13,14 +14,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_sub_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_subb_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -62,6 +63,40 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s4, s2, s4
+; GFX11-NEXT: s_subb_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -76,21 +111,21 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i32:
@@ -121,6 +156,34 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -137,17 +200,15 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +254,38 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +308,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +374,45 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +436,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s6, s4, s6
-; SI-NEXT: s_subb_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64:
@@ -359,6 +489,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s4, s6
+; GFX11-NEXT: s_subb_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -375,17 +536,15 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +552,8 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +596,42 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +654,15 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +670,8 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +715,42 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +773,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +833,42 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_sub_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +886,27 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_clamp_bit:
@@ -687,6 +951,45 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
@@ -712,19 +1015,19 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -787,6 +1090,50 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_sub_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_sub_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64