Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll  2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll  3215
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll  20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll  17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll  16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll  18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir  170
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir  78
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir  17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir  17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll  325
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll  344
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll  108
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll  32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll  42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll  110
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll  32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll  27
-rw-r--r--  llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir  34
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fabs.bf16.ll  48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll  138
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3.ll  46
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll  151
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg.bf16.ll  85
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll  296
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll  10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll  10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll  72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp.ll  15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp10.ll  15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll  14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll  76
-rw-r--r--  llvm/test/CodeGen/AMDGPU/maximumnum.ll  3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mfma-loop.ll  14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/minimumnum.ll  3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/regpressure_printer.mir  18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_cmp_0.ll  625
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-select.ll  64
-rw-r--r--  llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir  514
-rw-r--r--  llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll  12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll  52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll  51
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll  52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll  52
49 files changed, 3943 insertions(+), 3147 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll
index c92e5c5..edb3607 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -o - %s | FileCheck %s
declare align(8) dereferenceable(8) ptr @declared_with_ret_deref() #0
declare align(8) ptr @unknown_decl() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1aee6ab..1b879a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -403,40 +403,38 @@ define half @v_neg_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -460,40 +458,38 @@ define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -517,40 +513,38 @@ define half @v_rcp_f16_arcp(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -575,9 +569,7 @@ define half @v_rcp_f16_arcp_afn(half %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -600,40 +592,38 @@ define half @v_rcp_f16_ulp25(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -1454,70 +1444,67 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1526,30 +1513,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1561,26 +1545,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1594,30 +1575,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1628,26 +1606,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1660,30 +1636,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1696,24 +1669,21 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1726,27 +1696,25 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1757,70 +1725,67 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1829,30 +1794,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1864,26 +1826,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1897,30 +1856,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1931,26 +1887,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1963,30 +1917,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -1999,24 +1950,21 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2029,27 +1977,25 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -2064,33 +2010,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2101,39 +2046,37 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2143,30 +2086,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2179,26 +2119,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2213,30 +2150,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2248,26 +2182,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -2279,32 +2211,29 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2316,26 +2245,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2346,30 +2272,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, 1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2386,33 +2312,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2423,39 +2348,37 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2465,30 +2388,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2501,26 +2421,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2535,30 +2452,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2570,26 +2484,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -2601,32 +2513,29 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2638,26 +2547,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2668,30 +2574,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2704,70 +2610,67 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2814,11 +2717,8 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2864,70 +2764,67 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2936,30 +2833,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2971,26 +2865,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3004,30 +2895,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3038,26 +2926,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3070,30 +2956,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3106,24 +2989,21 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3136,27 +3016,25 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -4033,40 +3911,38 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4099,40 +3975,38 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4166,21 +4040,20 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rsq_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
@@ -4188,24 +4061,23 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-FLUSH-LABEL: s_rsq_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
@@ -4241,36 +4113,35 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -4283,42 +4154,40 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4330,31 +4199,28 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4369,25 +4235,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4402,25 +4265,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4434,25 +4294,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4466,25 +4324,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4498,25 +4354,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4530,29 +4383,27 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -4568,21 +4419,20 @@ define half @v_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4590,24 +4440,23 @@ define half @v_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4632,21 +4481,20 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4654,24 +4502,23 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4706,21 +4553,20 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4728,24 +4574,23 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4785,21 +4630,20 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4807,24 +4651,23 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4859,21 +4702,20 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4881,24 +4723,23 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4933,21 +4774,20 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4955,24 +4795,23 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5007,21 +4846,20 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5029,24 +4867,23 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5081,21 +4918,20 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5103,24 +4939,23 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5156,21 +4991,20 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5178,24 +5012,23 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5220,21 +5053,20 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5242,24 +5074,23 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5294,12 +5125,10 @@ define half @v_rsq_f16_afn(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5324,12 +5153,10 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5365,36 +5192,35 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5404,42 +5230,40 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5448,31 +5272,28 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5486,25 +5307,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5518,25 +5336,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5549,25 +5364,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5580,25 +5393,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5611,25 +5422,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5642,7 +5450,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5650,22 +5458,20 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -5679,36 +5485,35 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5718,42 +5523,40 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5762,31 +5565,28 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5800,25 +5600,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5832,25 +5629,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5863,25 +5657,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5894,25 +5686,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5925,25 +5715,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5956,7 +5743,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5964,22 +5751,20 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index e6a8bac..2356dad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -new-reg-bank-select -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -new-reg-bank-select -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b239..549af87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -88,11 +88,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: v_or_b32_e32 v1, s4, v0
; CI-NEXT: .LBB0_8: ; %Flow19
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
; CI-NEXT: s_and_b32 s2, 1, s2
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
@@ -1197,16 +1196,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v1, s4, v1
; CI-NEXT: .LBB9_16: ; %Flow54
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, 0
; CI-NEXT: s_and_b32 s0, s0, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00
; CI-NEXT: s_cselect_b32 s4, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
; CI-NEXT: s_and_b32 s3, 1, s4
@@ -1730,26 +1728,25 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v3, s1, v3
; CI-NEXT: .LBB10_32: ; %Flow124
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v5, 0
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00
; CI-NEXT: s_cselect_b32 s11, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
; CI-NEXT: s_and_b32 s2, s6, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s6, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s3
; CI-NEXT: s_and_b32 s4, s5, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
; CI-NEXT: s_cselect_b32 s12, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s10
; CI-NEXT: s_and_b32 s7, s7, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00
; CI-NEXT: s_cselect_b32 s7, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
; CI-NEXT: s_and_b32 s10, 1, s11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
index 920d8fa..ae7f6ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 {
; GCN-LABEL: v_insert_v64i32_37:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
index cfbb429..aabf256 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
@@ -1,11 +1,11 @@
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_{{read2|load_2addr}}_b32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index 66cdfc2..7b923f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -1,14 +1,14 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir
new file mode 100644
index 0000000..87836e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir
@@ -0,0 +1,170 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
+
+---
+name: assert_sext_vgpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: assert_sext_vgpr
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT %copy, 4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+ %copy:_(s32) = COPY $vgpr0
+ %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_sgpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8
+
+ ; CHECK-LABEL: name: assert_sext_sgpr
+ ; CHECK: liveins: $sgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:sgpr(s32) = COPY $sgpr8
+ ; CHECK-NEXT: %assert_sext:sgpr(s32) = G_ASSERT_SEXT %copy, 4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+ %copy:_(s32) = COPY $sgpr8
+ %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_agpr
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $agpr0
+
+ ; CHECK-LABEL: name: assert_sext_agpr
+ ; CHECK: liveins: $agpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT %copy, 4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+ %copy:_(s32) = COPY $agpr0
+ %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_vgpr_regclass
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: assert_sext_vgpr_regclass
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %copy(s32)
+ ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT [[COPY]], 4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+ %copy:vgpr_32(s32) = COPY $vgpr0
+ %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_sgpr_regclass
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8
+
+ ; CHECK-LABEL: name: assert_sext_sgpr_regclass
+ ; CHECK: liveins: $sgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:sgpr_32(s32) = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY %copy(s32)
+ ; CHECK-NEXT: %assert_sext:sgpr(s32) = G_ASSERT_SEXT [[COPY]], 4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32)
+ %copy:sgpr_32(s32) = COPY $sgpr8
+ %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_vgpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: assert_sext_vgpr_64
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:vreg_64(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64)
+ ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_SEXT [[COPY]], 4
+ ; CHECK-NEXT: %assert_sext:vreg_64(s64) = COPY [[ASSERT_SEXT]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64)
+ %copy:vreg_64(s64) = COPY $vgpr0_vgpr1
+ %assert_sext:vreg_64(s64) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_sgpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: assert_sext_sgpr_64
+ ; CHECK: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:sreg_64(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY %copy(s64)
+ ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:sgpr(s64) = G_ASSERT_SEXT [[COPY]], 4
+ ; CHECK-NEXT: %assert_sext:sreg_64(s64) = COPY [[ASSERT_SEXT]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64)
+ %copy:sreg_64(s64) = COPY $sgpr0_sgpr1
+ %assert_sext:sreg_64(s64) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
+
+---
+name: assert_sext_agpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $agpr0_agpr1
+
+ ; CHECK-LABEL: name: assert_sext_agpr_64
+ ; CHECK: liveins: $agpr0_agpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:areg_64(s64) = COPY $agpr0_agpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64)
+ ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_SEXT [[COPY]], 4
+ ; CHECK-NEXT: %assert_sext:areg_64(s64) = COPY [[ASSERT_SEXT]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64)
+ %copy:areg_64(s64) = COPY $agpr0_agpr1
+ %assert_sext:areg_64(s64) = G_ASSERT_SEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_sext
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir
index 0bce908..c64a8ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s
---
name: assert_zext_vgpr
@@ -53,8 +53,8 @@ body: |
; CHECK-LABEL: name: assert_zext_agpr
; CHECK: liveins: $agpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %copy:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: %assert_zext:agpr(s32) = G_ASSERT_ZEXT %copy, 4
+ ; CHECK-NEXT: %copy:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT %copy, 4
; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32)
%copy:_(s32) = COPY $agpr0
%assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4
@@ -74,7 +74,8 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %copy:vgpr_32(s32) = COPY $vgpr0
- ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT %copy, 4
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %copy(s32)
+ ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT [[COPY]], 4
; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32)
%copy:vgpr_32(s32) = COPY $vgpr0
%assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4
@@ -94,9 +95,76 @@ body: |
; CHECK: liveins: $sgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %copy:sgpr_32(s32) = COPY $sgpr8
- ; CHECK-NEXT: %assert_zext:sgpr(s32) = G_ASSERT_ZEXT %copy, 4
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY %copy(s32)
+ ; CHECK-NEXT: %assert_zext:sgpr(s32) = G_ASSERT_ZEXT [[COPY]], 4
; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32)
%copy:sgpr_32(s32) = COPY $sgpr8
%assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4
S_ENDPGM 0, implicit %assert_zext
...
+
+---
+name: assert_zext_vgpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: assert_zext_vgpr_64
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:vreg_64(s64) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64)
+ ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4
+ ; CHECK-NEXT: %assert_zext:vreg_64(s64) = COPY [[ASSERT_ZEXT]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64)
+ %copy:vreg_64(s64) = COPY $vgpr0_vgpr1
+ %assert_zext:vreg_64(s64) = G_ASSERT_ZEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_zext
+...
+
+---
+name: assert_zext_sgpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: assert_zext_sgpr_64
+ ; CHECK: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:sreg_64(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY %copy(s64)
+ ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:sgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4
+ ; CHECK-NEXT: %assert_zext:sreg_64(s64) = COPY [[ASSERT_ZEXT]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64)
+ %copy:sreg_64(s64) = COPY $sgpr0_sgpr1
+ %assert_zext:sreg_64(s64) = G_ASSERT_ZEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_zext
+...
+
+---
+name: assert_zext_agpr_64
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $agpr0_agpr1
+
+ ; CHECK-LABEL: name: assert_zext_agpr_64
+ ; CHECK: liveins: $agpr0_agpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %copy:areg_64(s64) = COPY $agpr0_agpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64)
+ ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4
+ ; CHECK-NEXT: %assert_zext:areg_64(s64) = COPY [[ASSERT_ZEXT]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64)
+ %copy:areg_64(s64) = COPY $agpr0_agpr1
+ %assert_zext:areg_64(s64) = G_ASSERT_ZEXT %copy, 4
+ S_ENDPGM 0, implicit %assert_zext
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
index eee553e..a7023d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: smax_s32_ss
@@ -188,8 +187,7 @@ body: |
; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32)
; CHECK-NEXT: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK-NEXT: [[SMAX1:%[0-9]+]]:sgpr(s32) = G_SMAX [[ASHR]], [[ASHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMAX]](s32), [[SMAX1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
index ef60aa8..9dd5f45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: smin_s32_ss
@@ -191,8 +190,7 @@ body: |
; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32)
; CHECK-NEXT: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK-NEXT: [[SMIN1:%[0-9]+]]:sgpr(s32) = G_SMIN [[ASHR]], [[ASHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMIN]](s32), [[SMIN1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
index 36a38aa..59d7dce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: umax_s32_ss
@@ -186,15 +185,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]]
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
; CHECK-NEXT: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[AND]], [[AND1]]
; CHECK-NEXT: [[UMAX1:%[0-9]+]]:sgpr(s32) = G_UMAX [[LSHR]], [[LSHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMAX]](s32), [[UMAX1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
index bb232b5e..fdb05f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: umin_s32_ss
@@ -190,15 +189,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]]
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
; CHECK-NEXT: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[AND]], [[AND1]]
; CHECK-NEXT: [[UMIN1:%[0-9]+]]:sgpr(s32) = G_UMIN [[LSHR]], [[LSHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMIN]](s32), [[UMIN1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 9ffc565..4f2c454 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2537,202 +2537,195 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GISEL-NEXT: v_trunc_f32_e32 v7, v4
-; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_trunc_f32_e32 v5, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8]
-; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6]
+; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7
; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2
+; GISEL-NEXT: v_trunc_f32_e32 v8, v6
+; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_mov_b32_e32 v2, v7
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7
+; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3]
+; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7]
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc
; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7
; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
@@ -2743,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2755,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
index ac1e11b..dfa613c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define i32 @test_min_max_ValK0_K1_i32(i32 %a) {
; GFX89-LABEL: test_min_max_ValK0_K1_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 82279e6..40b5db0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3035,203 +3035,193 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3
-; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
+; GISEL-NEXT: v_trunc_f32_e32 v5, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v3, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v3, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1]
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v9, v4
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1]
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v11, v7
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5
+; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0
-; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc
-; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
-; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4
+; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 4de1078..ded985e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2053,90 +2053,82 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2
; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2
; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2
-; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v15, 0, v2
+; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5
; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT: v_mul_lo_u32 v12, v3, v15
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v1, v2
+; GISEL-NEXT: v_mul_lo_u32 v16, 0, v2
+; GISEL-NEXT: v_mul_hi_u32 v17, v1, v2
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2
; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v17
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14
+; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], 0, v9, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11
-; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18
-; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2
-; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9]
-; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v14
+; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18
+; GISEL-NEXT: v_addc_u32_e64 v14, s[10:11], 0, v19, s[10:11]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8
+; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v14, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
index 2b54123..f5068f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
; GFX89-LABEL: test_min_max_ValK0_K1_u32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index a41ec8e..be5543b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2058,42 +2058,34 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2
-; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8
; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc
; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10
; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll
index c94b333..1f36902 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll
@@ -726,16 +726,16 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE815:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE016:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE117:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE016]], double [[TMP11]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE218:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE117]], double [[TMP12]], i64 2
@@ -746,8 +746,8 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE39:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE28]], double [[TMP8]], i64 3
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE410:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE39]], double [[TMP9]], i64 4
-; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE420]], ptr [[OUT]], align 1
+; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
; CHECK-NEXT: ret void
;
entry:
@@ -1187,11 +1187,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE960:%.*]] = extractelement <5 x double> [[IN]], i64 4
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[COND5_END]]
; CHECK: cond5.end:
-; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE046:%.*]] = insertelement <5 x double> poison, double [[TMP25]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE147:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE046]], double [[TMP26]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE248:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE147]], double [[TMP27]], i64 2
@@ -1204,11 +1204,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE859:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE450]], i64 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE061:%.*]] = insertelement <5 x double> poison, double [[TMP30]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE162:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE061]], double [[TMP31]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE263:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE162]], double [[TMP32]], i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index a4f9ce3..7ff86ac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -2160,7 +2160,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0
; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1
; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]]
+; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0
+; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1
+; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]])
+; IEEE-GOODFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]])
+; IEEE-GOODFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]])
+; IEEE-GOODFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]])
+; IEEE-GOODFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]])
+; IEEE-GOODFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]])
+; IEEE-GOODFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0
+; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0
; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -2231,7 +2246,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0
; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1
; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]]
+; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0
+; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1
+; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]])
+; IEEE-BADFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]])
+; IEEE-BADFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]])
+; IEEE-BADFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]])
+; IEEE-BADFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]])
+; IEEE-BADFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]])
+; IEEE-BADFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0
+; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0
; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -2258,7 +2288,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; DAZ-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; DAZ-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> splat (float 1.000000e+00), [[SQRT_X_NO_MD]]
; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2:![0-9]+]]
+; DAZ-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 0
+; DAZ-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[X]], i64 1
+; DAZ-NEXT: [[TMP41:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP39]])
+; DAZ-NEXT: [[TMP42:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP40]])
+; DAZ-NEXT: [[TMP43:%.*]] = insertelement <2 x float> poison, float [[TMP41]], i64 0
+; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = insertelement <2 x float> [[TMP43]], float [[TMP42]], i64 1
; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1
; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -2276,7 +2311,9 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1
; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0
; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1
-; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
+; DAZ-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[X]], i64 0
+; DAZ-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[X]], i64 1
+; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP44]])
; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]])
; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0
; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1
@@ -2290,7 +2327,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; DAZ-NEXT: [[TMP26:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0
; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP26]], float [[TMP25]], i64 1
; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]]
+; DAZ-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[X]], i64 0
+; DAZ-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[X]], i64 1
+; DAZ-NEXT: [[TMP36:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP34]])
+; DAZ-NEXT: [[TMP37:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP35]])
+; DAZ-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP36]], i64 0
+; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1
; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0
; DAZ-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1
; DAZ-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -3200,9 +3242,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]])
-; DAZ-NEXT: [[TMP17:%.*]] = fneg contract float [[TMP13]]
-; DAZ-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]])
+; DAZ-NEXT: [[TMP45:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP45]]
; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]])
; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0
; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1
@@ -3675,9 +3721,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]])
-; DAZ-NEXT: [[TMP17:%.*]] = fneg arcp contract float [[TMP13]]
-; DAZ-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP26]])
+; DAZ-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP18:%.*]] = fneg arcp contract float [[TMP29]]
; DAZ-NEXT: [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]])
; DAZ-NEXT: [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]]
; DAZ-NEXT: [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]])
@@ -3850,19 +3900,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2
; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3
-; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]])
-; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
-; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP50]]
-; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP22]])
+; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00)
+; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00)
+; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]]
; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1
@@ -3903,19 +3943,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2
; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3
-; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]])
-; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]])
-; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
-; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
-; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]])
-; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]]
-; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP22]])
+; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00)
+; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00)
+; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]]
; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]])
@@ -3956,9 +3986,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
; DAZ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1
; DAZ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2
; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3
-; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP8]])
-; DAZ-NEXT: [[TMP13:%.*]] = fneg contract float [[TMP9]]
-; DAZ-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
+; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00)
+; DAZ-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00)
+; DAZ-NEXT: [[TMP14:%.*]] = fneg contract float [[TMP13]]
; DAZ-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0
; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4b14dc6..7ee0015f 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -21204,18 +21204,14 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fabs_bf16:
@@ -21440,10 +21436,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fneg_fabs_bf16:
@@ -21451,10 +21444,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fneg_fabs_bf16:
@@ -21510,23 +21500,17 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX7-NEXT: s_bitset0_b32 s0, 31
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll
new file mode 100644
index 0000000..f466513
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Make sure the coalescer doesn't introduce any uses of
+; vreg_1024. None are available to allocate with the register budget
+; of this function.
+
+define void @no_introduce_vreg_1024() #0 {
+; CHECK-LABEL: no_introduce_vreg_1024:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v9, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:15]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %tuple = call <8 x i32> asm sideeffect "; def $0","=v"()
+ %sub0 = extractelement <8 x i32> %tuple, i32 0
+ %insert = insertelement <16 x i32> poison, i32 %sub0, i32 9
+ call void asm sideeffect "; use $0","v"(<16 x i32> %insert)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir
new file mode 100644
index 0000000..1f414eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# The register budget for this function does not permit using 1024-bit
+# registers. The coalescer should not introduce a 1024-bit virtual
+# register which will fail to allocate.
+
+--- |
+ define void @no_introduce_vreg_1024() #0 {
+ ret void
+ }
+
+ attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
+...
+---
+name: no_introduce_vreg_1024
+tracksRegLiveness: true
+machineFunctionInfo:
+ occupancy: 10
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+
+ ; CHECK-LABEL: name: no_introduce_vreg_1024
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub9:vreg_512 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: SI_RETURN implicit [[COPY1]]
+ %0:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ undef %1.sub9:vreg_512 = COPY %0.sub0
+ SI_RETURN implicit %1
+
+...
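(Editor's note on the two new coalescer tests above: they rest on a register-budget argument — with "amdgpu-waves-per-eu"="10", the per-wave VGPR allowance is smaller than the 32 VGPRs a 1024-bit tuple needs, so vreg_1024 has no allocatable registers. A minimal sketch of that arithmetic follows, assuming 256 addressable VGPRs per SIMD on gfx900 and a 4-VGPR allocation granule; both figures are illustrative assumptions, not taken from the patch.)

    # Sketch: why a 1024-bit (32-VGPR) tuple cannot be allocated at 10 waves/EU.
    # Assumed numbers, for illustration only.
    TOTAL_VGPRS = 256   # assumed addressable VGPRs per SIMD on gfx900
    GRANULE = 4         # assumed VGPR allocation granule

    def max_vgprs_per_wave(waves_per_eu: int) -> int:
        # Largest granule-aligned VGPR count that still fits the requested occupancy.
        return (TOTAL_VGPRS // waves_per_eu) // GRANULE * GRANULE

    budget = max_vgprs_per_wave(10)   # -> 24 VGPRs per wave
    print(budget, budget >= 32)       # 24 False: no room for a 32-VGPR tuple
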
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index 5d184b1..c46fcde 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -218,19 +218,11 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s4, s3, 0xffff0000
-; CI-NEXT: s_lshl_b32 s3, s3, 16
-; CI-NEXT: s_and_b32 s5, s2, 0xffff0000
-; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
-; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3|
-; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5|
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_alignbit_b32 v1, v0, v1, 16
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -537,16 +529,15 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e64 v4, 1.0, |v3|
-; CI-NEXT: v_mul_f32_e64 v5, 1.0, |v2|
-; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; CI-NEXT: v_mul_f32_e32 v3, v4, v3
-; CI-NEXT: v_mul_f32_e32 v2, v5, v2
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v2
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; CI-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_mul_f32_e32 v2, v2, v5
+; CI-NEXT: v_mul_f32_e32 v3, v3, v4
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -898,16 +889,13 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 {
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1|
-; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_add_f32_e32 v0, 2.0, v0
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: flat_store_short v[0:1], v1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_short v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 3983655..38239c5 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1634,29 +1634,18 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
; IR-IEEE-SDAG: ; %bb.0:
; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
+; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000
; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0
-; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0
-; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0
-; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
+; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2
; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2
; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2
; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3
; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4
@@ -1668,24 +1657,14 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
; IR-IEEE-GISEL: ; %bb.0:
; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000
-; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
+; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0
-; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0
-; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1
-; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0
-; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5]
-; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; IR-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
+; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1
; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1705,75 +1684,24 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
-; IR-DAZ-SDAG: ; %bb.0:
-; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
-; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3
-; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; IR-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; IR-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
-; IR-DAZ-GISEL: ; %bb.0:
-; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; IR-DAZ-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1
-; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3
-; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31]
+; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; IR-DAZ: ; %bb.0:
+; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1
+; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0
%fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0
ret float %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f80..9e15225 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7464,18 +7464,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
@@ -7639,27 +7636,24 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0
; SI-GISEL-NEXT: s_mov_b32 s10, 0
; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
-; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5]
-; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7
+; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -8712,12 +8706,10 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -8796,17 +8788,15 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 64a9727..76da0aa 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -107,12 +107,10 @@ define amdgpu_kernel void @fneg_fabs_fmul_bf16(ptr addrspace(1) %out, bfloat %x,
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s2, 0x7fff
-; CI-NEXT: s_lshl_b32 s3, s3, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
+; CI-NEXT: s_lshl_b32 s3, s2, 16
; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_mul_f32_e32 v0, s2, v0
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_mul_f32_e64 v0, s2, -|v0|
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -204,12 +202,10 @@ define amdgpu_kernel void @fneg_fabs_free_bf16(ptr addrspace(1) %out, i16 %in) {
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -279,12 +275,10 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) {
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -345,43 +339,22 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) {
}
define amdgpu_kernel void @v_fneg_fabs_bf16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; CI-LABEL: v_fneg_fabs_bf16:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT: s_add_i32 s12, s12, s17
-; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_load_ushort v2, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e64 v2, 1.0, |v2|
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT: flat_store_short v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: v_fneg_fabs_bf16:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT: s_add_i32 s12, s12, s17
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, 0x8000, v2
-; VI-NEXT: flat_store_short v[0:1], v2
-; VI-NEXT: s_endpgm
+; CIVI-LABEL: v_fneg_fabs_bf16:
+; CIVI: ; %bb.0:
+; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT: s_waitcnt lgkmcnt(0)
+; CIVI-NEXT: v_mov_b32_e32 v0, s2
+; CIVI-NEXT: v_mov_b32_e32 v1, s3
+; CIVI-NEXT: flat_load_ushort v2, v[0:1]
+; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: s_waitcnt vmcnt(0)
+; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2
+; CIVI-NEXT: flat_store_short v[0:1], v2
+; CIVI-NEXT: s_endpgm
;
; GFX9-LABEL: v_fneg_fabs_bf16:
; GFX9: ; %bb.0:
@@ -431,21 +404,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s2, 0xffff0000
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_add_f32_e64 v0, s3, 2.0
-; CI-NEXT: v_add_f32_e64 v1, s2, 1.0
-; CI-NEXT: v_readfirstlane_b32 s2, v0
+; CI-NEXT: s_lshl_b32 s3, s2, 16
; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: s_bitset0_b32 s2, 31
-; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
-; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: s_lshr_b32 s2, s2, 16
-; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16
+; CI-NEXT: v_add_f32_e64 v1, s2, 2.0
+; CI-NEXT: v_add_f32_e64 v0, s3, 1.0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -566,15 +531,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s3, s2, 0x7fff
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2
-; CI-NEXT: s_lshl_b32 s2, s3, 16
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: s_or_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -629,27 +589,11 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshl_b32 s4, s2, 16
-; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2|
-; CI-NEXT: s_and_b32 s2, s3, 0xffff0000
-; CI-NEXT: s_lshl_b32 s5, s3, 16
-; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2|
-; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
-; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s5|
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT: v_alignbit_b32 v1, v3, v1, 16
-; CI-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; CI-NEXT: s_or_b32 s3, s3, 0x80008000
+; CI-NEXT: s_or_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -860,21 +804,20 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: s_and_b32 s1, s4, 0x7fff
-; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000
-; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2
-; CI-NEXT: s_lshl_b32 s1, s1, 16
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
-; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1
-; CI-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; CI-NEXT: v_mov_b32_e32 v5, s0
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_or_b32 s2, s0, 0x8000
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000
+; CI-NEXT: s_and_b32 s2, s2, 0xffff
+; CI-NEXT: s_or_b32 s1, s1, s2
+; CI-NEXT: s_bitset1_b32 s1, 31
+; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: flat_store_dword v[0:1], v5
-; CI-NEXT: flat_store_dword v[2:3], v4
+; CI-NEXT: flat_store_dword v[0:1], v4
+; CI-NEXT: v_mov_b32_e32 v0, s1
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_fneg_multi_use_fabs_v2bf16:
@@ -1086,5 +1029,3 @@ declare <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat>) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CIVI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index d232693..98044a7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -14,11 +14,10 @@ define amdgpu_kernel void @s_fneg_bf16(ptr addrspace(1) %out, bfloat %in) #0 {
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -93,9 +92,7 @@ define amdgpu_kernel void @v_fneg_bf16(ptr addrspace(1) %out, ptr addrspace(1) %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_ushort v2, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -170,11 +167,10 @@ define amdgpu_kernel void @s_fneg_free_bf16(ptr addrspace(1) %out, i16 %in) #0 {
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -248,9 +244,9 @@ define amdgpu_kernel void @v_fneg_fold_bf16(ptr addrspace(1) %out, ptr addrspace
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e32 v3, -1.0, v2
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_mul_f32_e32 v2, v3, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: flat_store_short v[0:1], v2
@@ -365,13 +361,13 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s2, 0xffff0000
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
-; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: s_xor_b32 s2, s2, 0x8000
+; CI-NEXT: s_and_b32 s2, s2, 0xffff
+; CI-NEXT: s_or_b32 s2, s2, s3
+; CI-NEXT: s_add_i32 s2, s2, 0x80000000
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -426,16 +422,16 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 {
; CI-NEXT: ; def s2
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_and_b32 s3, s2, 0xffff0000
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: s_xor_b32 s2, s2, 0x8000
+; CI-NEXT: s_and_b32 s2, s2, 0xffff
+; CI-NEXT: s_or_b32 s2, s2, s3
+; CI-NEXT: s_add_i32 s2, s2, 0x80000000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -501,13 +497,11 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v2, v[0:1]
+; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e32 v3, -1.0, v3
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2
+; CI-NEXT: v_bfi_b32 v2, s0, v3, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -570,13 +564,13 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 {
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s2, 0xffff0000
-; CI-NEXT: s_lshl_b32 s2, s2, 16
-; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3
-; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: s_xor_b32 s2, s2, 0x8000
+; CI-NEXT: s_and_b32 s2, s2, 0xffff
+; CI-NEXT: s_or_b32 s2, s2, s3
+; CI-NEXT: s_add_i32 s2, s2, 0x80000000
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -637,16 +631,14 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_mul_f32_e32 v4, -1.0, v3
-; CI-NEXT: v_mul_f32_e32 v5, -1.0, v2
-; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; CI-NEXT: v_mul_f32_e32 v3, v4, v3
-; CI-NEXT: v_mul_f32_e32 v2, v5, v2
-; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2
+; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_mul_f32_e64 v2, -v2, v2
+; CI-NEXT: v_mul_f32_e32 v3, v3, v4
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -912,12 +904,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) #
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_mul_f32_e32 v1, -1.0, v1
-; CI-NEXT: v_mul_f32_e32 v0, -1.0, v0
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; CI-NEXT: flat_store_short v[0:1], v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_short v[0:1], v1
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll
new file mode 100644
index 0000000..49204f8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) {
+; GFX6-LABEL: fptoui_f32_to_i16_to_f32:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f32_to_i16_to_f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui float %x to i16
+ %fp = uitofp i16 %ui to float
+ store float %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) {
+; GFX6-LABEL: fptoui_f32_to_i32_to_f32:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f32_to_i32_to_f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui float %x to i32
+ %fp = uitofp i32 %ui to float
+ store float %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) {
+; GFX6-LABEL: fptoui_f32_to_i64_to_f32:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f32_to_i64_to_f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui float %x to i64
+ %fp = uitofp i64 %ui to float
+ store float %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) {
+; GFX6-LABEL: fptoui_f16_to_i16_to_f16:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f16_to_i16_to_f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui half %x to i16
+ %fp = uitofp i16 %ui to half
+ store half %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) {
+; GFX6-LABEL: fptoui_f16_to_i32_to_f16:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0|
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: v_trunc_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f16_to_i32_to_f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui half %x to i32
+ %fp = uitofp i32 %ui to half
+ store half %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) {
+; GFX6-LABEL: fptoui_f16_to_i64_to_f16:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0|
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: v_trunc_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f16_to_i64_to_f16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui half %x to i64
+ %fp = uitofp i64 %ui to half
+ store half %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) {
+; GFX6-LABEL: fptoui_f64_to_i16_to_f64:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f64_to_i16_to_f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui double %x to i16
+ %fp = uitofp i16 %ui to double
+ store double %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) {
+; GFX6-LABEL: fptoui_f64_to_i32_to_f64:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f64_to_i32_to_f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui double %x to i32
+ %fp = uitofp i32 %ui to double
+ store double %fp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) {
+; GFX6-LABEL: fptoui_f64_to_i64_to_f64:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_mov_b32 s5, 0xfffff
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: v_not_b32_e32 v0, 31
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014
+; GFX6-NEXT: s_addk_i32 s7, 0xfc01
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
+; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000
+; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5]
+; GFX6-NEXT: s_cmp_lt_i32 s7, 0
+; GFX6-NEXT: s_cselect_b32 s4, 0, s4
+; GFX6-NEXT: s_cselect_b32 s5, s8, s5
+; GFX6-NEXT: s_cmp_gt_i32 s7, 51
+; GFX6-NEXT: s_cselect_b32 s3, s3, s5
+; GFX6-NEXT: s_cselect_b32 s2, s2, s4
+; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3
+; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000
+; GFX6-NEXT: v_mov_b32_e32 v3, s3
+; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3]
+; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3]
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX9-LABEL: fptoui_f64_to_i64_to_f64:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ui = fptoui double %x to i64
+ %fp = uitofp i64 %ui to double
+ store double %fp, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 67d0410..3324018 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -3,11 +3,11 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; SI-LABEL: is_private_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 63333ed..355d002 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -3,11 +3,11 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CIT-LABEL: is_local_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index ee11b92..0c1448a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v12, s16
+; GISEL-NEXT: v_mov_b32_e32 v16, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -834,24 +834,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1349,24 +1349,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1513,24 +1513,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1677,24 +1677,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1841,24 +1841,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index af79c91..ac356fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6011,8 +6011,7 @@ define half @v_exp_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6512,10 +6511,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6709,12 +6707,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a99c199..d12ebe4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6092,8 +6092,7 @@ define half @v_exp10_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6594,10 +6593,9 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6791,12 +6789,11 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 3f66c23..259ee0b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -488,13 +488,11 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -582,15 +580,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; GISEL-CI-NEXT: s_waitcnt vmcnt(0)
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 21e6faf4..ba77552 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -313,13 +313,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -1009,28 +1007,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -1225,25 +1221,23 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
@@ -1441,30 +1435,28 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1622,16 +1614,14 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
@@ -1790,17 +1780,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e..c90b2c9 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -271,8 +271,7 @@ define half @v_maximumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 0af655df..4bb6538 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -2399,8 +2399,9 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2409,7 +2410,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: s_add_i32 s1, s1, -1
; GFX90A-NEXT: s_cmp_lg_u32 s1, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2
@@ -2468,8 +2469,9 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2478,7 +2480,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: s_add_i32 s1, s1, -1
; GFX942-NEXT: s_cmp_lg_u32 s1, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB9_2
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d..64e8b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -271,8 +271,7 @@ define half @v_minimumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir
index 8d5b5e4..b41aa08 100644
--- a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir
+++ b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir
@@ -510,14 +510,14 @@ body: |
; RPU-NEXT: 0 0 $sgpr0 = S_BUFFER_LOAD_DWORD_IMM $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0
; RPU-NEXT: 0 0
; RPU-NEXT: 0 1 undef %0.sub5:vreg_512 = V_MOV_B32_e32 5, implicit $exec
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: 0 0 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: 0 0 S_CBRANCH_SCC1 %bb.2, implicit $scc
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: 0 0 S_BRANCH %bb.1
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: Live-out:
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: 0 1 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: 0 1 S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: 0 1 S_BRANCH %bb.1
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: Live-out: %0:0000000000000C00
; RPU-NEXT: Live-thr:
; RPU-NEXT: 0 0
; RPU-NEXT: bb.1:
@@ -571,8 +571,6 @@ body: |
; RPD-NEXT: 0 1 S_BRANCH %bb.1
; RPD-NEXT: 0 1
; RPD-NEXT: Live-out: %0:0000000000000C00
- ; RPD-NEXT: mis LIS:
- ; RPD-NEXT: %0:L0000000000000C00 isn't found in LIS reported set
; RPD-NEXT: Live-thr:
; RPD-NEXT: 0 0
; RPD-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
new file mode 100644
index 0000000..f53aaaa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -0,0 +1,625 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
+declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
+
+define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: shl32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: shl64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: lshr32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshr_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = lshr i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: lshr64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = lshr i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: ashr32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_ashr_i32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = ashr i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: ashr64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = ashr i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @abs32(i32 inreg %val0) {
+; CHECK-LABEL: abs32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %neg = sub i32 0, %val0
+ %cond = icmp sgt i32 %val0, %neg
+ %result = select i1 %cond, i32 %val0, i32 %neg
+ call void asm "; use $0", "s"(i32 %result)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: and32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: and64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: or32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: or64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: xor32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i32 %val0, %val1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: xor64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i64 %val0, %val1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: nand32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nand_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i32 %val0, %val1
+ %result2 = xor i32 %result, -1
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: nand64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = and i64 %val0, %val1
+ %result2 = xor i64 %result, -1
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: nor32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i32 %val0, %val1
+ %result2 = xor i32 %result, -1
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: nor64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = or i64 %val0, %val1
+ %result2 = xor i64 %result, -1
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: xnor32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xnor_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i32 %val0, %val1
+ %result2 = xor i32 %result, -1
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: xnor64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i64 %val0, %val1
+ %result2 = xor i64 %result, -1
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: andn232:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_andn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i32 %val1, -1
+ %result = and i32 %val0, %nval1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: nandn264:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i64 %val1, -1
+ %result = and i64 %val0, %nval1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: orn232:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_orn2_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i32 %val1, -1
+ %result = or i32 %val0, %nval1
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: orn264:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %nval1 = xor i64 %val1, -1
+ %result = or i64 %val0, %nval1
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
+; CHECK-LABEL: bfe_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i32 %val0, 8
+ %result = ashr i32 %shl, 24
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_i64(i64 inreg %val0) {
+; CHECK-LABEL: bfe_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x80000
+; CHECK-NEXT: s_and_b32 s0, s0, 0xff
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i64 %val0, 56
+ %result = ashr i64 %shl, 56
+ call void asm "; use $0", "s"(i64 %result)
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
+; CHECK-LABEL: bfe_u32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i32 %val0, 8
+ %result = lshr i32 %shl, 24
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
+; CHECK-LABEL: bfe_u64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_and_b32 s0, s0, 0xff
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %shl = shl i64 %val0, 56
+ %result = lshr i64 %shl, 56
+ call void asm "; use $0", "s"(i64 %result)
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
+; CHECK-LABEL: bcnt032:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_sub_i32 s0, 32, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
+; CHECK-LABEL: bcnt064:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_sub_u32 s0, 64, s0
+; CHECK-NEXT: s_subb_u32 s1, 0, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
+; CHECK-LABEL: bcnt132:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ call void asm "; use $0", "s"(i32 %result)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt164(i64 inreg %val0) {
+; CHECK-LABEL: bcnt164:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ call void asm "; use $0", "s"(i64 %result)
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
+; CHECK-LABEL: quadmask32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_quadmask_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val0) nounwind readnone
+ call void asm "; use $0", "s"(i32 %result)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
+; CHECK-LABEL: quadmask64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val0) nounwind readnone
+ call void asm "; use $0", "s"(i64 %result)
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @not32(i32 inreg %val0) {
+; CHECK-LABEL: not32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_not_b32 s0, s0
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i32 %val0, -1
+ call void asm "; use $0", "s"(i32 %result)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @not64(i64 inreg %val0) {
+; CHECK-LABEL: not64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = xor i64 %val0, -1
+ call void asm "; use $0", "s"(i64 %result)
+ %cmp = icmp ne i64 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll
index f001bf0..b52913f 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll
@@ -20,34 +20,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s7, s4, s3
+; GFX90A-NEXT: s_cselect_b32 s7, s3, s2
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
+; GFX90A-NEXT: s_cselect_b32 s7, s4, s7
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s7, s6, s7
+; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
; GFX90A-NEXT: s_or_b32 s7, s7, s0
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
+; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec
-; GFX90A-NEXT: s_cselect_b32 s6, s7, s6
+; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec
-; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
+; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
; GFX90A-NEXT: s_cmp_eq_u32 s1, 0
-; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
+; GFX90A-NEXT: s_cselect_b32 s2, s7, s2
; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s2, 0, s2
+; GFX90A-NEXT: s_cselect_b32 s6, 0, s6
; GFX90A-NEXT: s_mov_b64 vcc, vcc
; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock
@@ -68,34 +68,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX942-NEXT: s_cmp_eq_u32 s1, 1
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s7, s4, s3
+; GFX942-NEXT: s_cselect_b32 s7, s3, s2
; GFX942-NEXT: s_cmp_eq_u32 s1, 2
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s7, s5, s7
+; GFX942-NEXT: s_cselect_b32 s7, s4, s7
; GFX942-NEXT: s_cmp_eq_u32 s1, 3
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s7, s6, s7
+; GFX942-NEXT: s_cselect_b32 s7, s5, s7
; GFX942-NEXT: s_or_b32 s7, s7, s0
; GFX942-NEXT: s_cmp_eq_u32 s1, 1
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s4, s7, s4
+; GFX942-NEXT: s_cselect_b32 s3, s7, s3
; GFX942-NEXT: s_cmp_eq_u32 s1, 3
; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec
-; GFX942-NEXT: s_cselect_b32 s6, s7, s6
+; GFX942-NEXT: s_cselect_b32 s5, s7, s5
; GFX942-NEXT: s_cmp_eq_u32 s1, 2
; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec
-; GFX942-NEXT: s_cselect_b32 s5, s7, s5
+; GFX942-NEXT: s_cselect_b32 s4, s7, s4
; GFX942-NEXT: s_cmp_eq_u32 s1, 0
-; GFX942-NEXT: s_cselect_b32 s3, s7, s3
+; GFX942-NEXT: s_cselect_b32 s2, s7, s2
; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s2, 0, s2
+; GFX942-NEXT: s_cselect_b32 s6, 0, s6
; GFX942-NEXT: s_mov_b64 vcc, vcc
; GFX942-NEXT: s_cbranch_vccnz .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock
@@ -117,34 +117,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s7, s4, s3
+; GFX1030-NEXT: s_cselect_b32 s7, s3, s2
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
+; GFX1030-NEXT: s_cselect_b32 s7, s4, s7
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s7, s6, s7
+; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
; GFX1030-NEXT: s_or_b32 s7, s7, s0
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
+; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s9, -1, 0
; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s6, s7, s6
+; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s10, -1, 0
; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
+; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
; GFX1030-NEXT: s_cmp_eq_u32 s1, 0
-; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
+; GFX1030-NEXT: s_cselect_b32 s2, s7, s2
; GFX1030-NEXT: s_or_b32 s7, s10, s8
; GFX1030-NEXT: s_or_b32 s7, s9, s7
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s2, 0, s2
+; GFX1030-NEXT: s_cselect_b32 s6, 0, s6
; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1030-NEXT: s_endpgm
@@ -166,38 +166,38 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX1100-NEXT: s_cselect_b32 s7, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s7, s4, s3
+; GFX1100-NEXT: s_cselect_b32 s7, s3, s2
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
+; GFX1100-NEXT: s_cselect_b32 s7, s4, s7
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s7, s6, s7
+; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
; GFX1100-NEXT: s_or_b32 s7, s7, s0
; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
+; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s9, -1, 0
; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s6, s7, s6
+; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s10, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
+; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
; GFX1100-NEXT: s_cmp_eq_u32 s1, 0
-; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
+; GFX1100-NEXT: s_cselect_b32 s2, s7, s2
; GFX1100-NEXT: s_or_b32 s7, s10, s8
; GFX1100-NEXT: s_or_b32 s7, s9, s7
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s2, 0, s2
+; GFX1100-NEXT: s_cselect_b32 s6, 0, s6
; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 8b467eb..75ae76f 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -1,7 +1,7 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX950 %s
# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX942 %s
-# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90a %s
+# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90A %s
---
name: test_pk_mul_unpacking_f32
@@ -57,26 +57,26 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_pk_mul_unpacking_f32
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_pk_mul_unpacking_f32
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -150,26 +150,26 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_op_sel_selection_unpacking_f32
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -243,26 +243,26 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_op_sel_hi_selection_unpacking_f32
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -370,41 +370,41 @@ body: |
; GFX942-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_pk_add_unpacking_f32
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GFX90a-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
- ; GFX90a-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_pk_add_unpacking_f32
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
@@ -438,7 +438,6 @@ body: |
renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
-
...
---
name: test_pk_fma_unpacking_f32
@@ -490,24 +489,24 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_pk_fma_unpacking_f32
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -577,25 +576,25 @@ body: |
; GFX942-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -670,28 +669,28 @@ body: |
; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -705,14 +704,11 @@ body: |
early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
$vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
-
$vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
$vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
-
$vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
$vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
$vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
-
S_ENDPGM 0
...
@@ -770,26 +766,26 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -861,26 +857,26 @@ body: |
; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
- ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
- ; GFX90a-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -950,25 +946,25 @@ body: |
; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_mfma_def_using_instr_blocks_unpacking
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_mfma_def_using_instr_blocks_unpacking
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -1041,26 +1037,26 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_unpacking_with_imm_input
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_unpacking_with_imm_input
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -1134,26 +1130,26 @@ body: |
; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GFX942-NEXT: S_ENDPGM 0
;
- ; GFX90a-LABEL: name: test_neg_lo_hi_post_unpacking
- ; GFX90a: liveins: $sgpr4_sgpr5
- ; GFX90a-NEXT: {{ $}}
- ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GFX90a-NEXT: S_WAITCNT 49279
- ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GFX90a-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: test_neg_lo_hi_post_unpacking
+ ; GFX90A: liveins: $sgpr4_sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90A-NEXT: S_WAITCNT 49279
+ ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 6b5bae0..c9b94e0 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -6,12 +6,12 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
; Pre-GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index b5d9d00..8d0e003 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i8:
@@ -1632,6 +1632,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1678,7 +1679,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_smax_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,7 +1693,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_smax_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1713,7 +1714,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_smax_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1747,7 +1748,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1900,6 +1901,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1961,7 +1963,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1977,7 +1979,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2003,7 +2005,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2041,7 +2043,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2049,6 +2051,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2139,7 +2142,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2159,7 +2162,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2192,7 +2195,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2238,7 +2241,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2247,6 +2250,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2391,7 +2395,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2419,7 +2423,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2467,7 +2471,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2528,7 +2532,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index 2a989ec..f15ecf0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i8:
@@ -1632,6 +1632,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1678,7 +1679,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_smin_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,7 +1693,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_smin_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1713,7 +1714,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_smin_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1747,7 +1748,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1900,6 +1901,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1961,7 +1963,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1977,7 +1979,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2003,7 +2005,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2041,7 +2043,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2049,6 +2051,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2139,7 +2142,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2159,7 +2162,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2192,7 +2195,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2238,7 +2241,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2391,7 +2394,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2419,7 +2422,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2467,7 +2470,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2528,7 +2531,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index 69fd58a..e62165c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_umax_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i8:
@@ -1525,6 +1525,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1569,7 +1570,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_umax_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1583,7 +1584,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_umax_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1604,7 +1605,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_umax_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1638,7 +1639,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1782,6 +1783,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1841,7 +1843,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1857,7 +1859,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1883,7 +1885,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1921,7 +1923,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1929,6 +1931,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2017,7 +2020,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2037,7 +2040,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2070,7 +2073,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2116,7 +2119,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2125,6 +2128,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2267,7 +2271,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2295,7 +2299,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2343,7 +2347,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2404,7 +2408,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index 1d3b42e..83ecaaa 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_umin_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i8:
@@ -1271,6 +1271,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1312,7 +1313,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_umin_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1326,7 +1327,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_umin_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1347,7 +1348,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_umin_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1381,7 +1382,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1527,6 +1528,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1583,7 +1585,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1599,7 +1601,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1625,7 +1627,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1663,7 +1665,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1671,6 +1673,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1756,7 +1759,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1776,7 +1779,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1809,7 +1812,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1855,7 +1858,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1864,6 +1867,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is a regression. Need a pattern to look through COPY.
define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2003,7 +2007,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2031,7 +2035,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2079,7 +2083,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2140,7 +2144,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]