Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 84
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll | 55
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-vmul.ll | 161
-rw-r--r-- llvm/test/CodeGen/AArch64/framelayout-split-sve.mir | 49
-rw-r--r-- llvm/test/CodeGen/AArch64/highextractbitcast.ll | 26
-rw-r--r-- llvm/test/CodeGen/AArch64/ldst-implicitop.mir | 80
-rw-r--r-- llvm/test/CodeGen/AArch64/preserve_mostcc.ll | 38
-rw-r--r-- llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll | 3
-rw-r--r-- llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll | 338
-rw-r--r-- llvm/test/CodeGen/AArch64/stack-hazard.ll | 1550
-rw-r--r-- llvm/test/CodeGen/AArch64/vldn_shuffle.ll | 105
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 10
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 1048
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 2564
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1050
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 3064
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 842
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll | 31
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll | 74
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 519
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 123
-rw-r--r-- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 13
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 266
-rw-r--r-- llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll | 518
-rw-r--r-- llvm/test/CodeGen/AMDGPU/limit-coalesce.mir | 63
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 74
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/packetizer.ll | 52
-rw-r--r-- llvm/test/CodeGen/AMDGPU/private-function.ll | 16
-rw-r--r-- llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll | 325
-rw-r--r-- llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir | 131
-rw-r--r-- llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir | 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 466
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 45
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir | 42
-rw-r--r-- llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll | 3
-rw-r--r-- llvm/test/CodeGen/ARM/call-graph-section-assembly.ll | 3
-rw-r--r-- llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll | 2
-rw-r--r-- llvm/test/CodeGen/BPF/jump_table_blockaddr.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/jump_table_global_var.ll | 4
-rw-r--r-- llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll | 60
-rw-r--r-- llvm/test/CodeGen/DirectX/f16tof32.ll | 57
-rw-r--r-- llvm/test/CodeGen/DirectX/wavesize-md-errs.ll | 31
-rw-r--r-- llvm/test/CodeGen/DirectX/wavesize-md-valid.ll | 96
-rw-r--r-- llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll | 50
-rw-r--r-- llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll | 6
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll | 303
-rw-r--r-- llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 343
-rw-r--r-- llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll | 2
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll | 91
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-acc-memops.ll | 170
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-acc-spill.ll | 102
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll | 166
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-intrinsics.ll | 517
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-outer-product.ll | 1266
-rw-r--r-- llvm/test/CodeGen/PowerPC/mma-phi-accs.ll | 202
-rw-r--r-- llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll | 39
-rw-r--r-- llvm/test/CodeGen/PowerPC/vec_rounding.ll | 195
-rw-r--r-- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1
-rw-r--r-- llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir | 35
-rw-r--r-- llvm/test/CodeGen/RISCV/features-info.ll | 1
-rw-r--r-- llvm/test/CodeGen/RISCV/mask-variable-shift.ll | 132
-rw-r--r-- llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll | 2
-rw-r--r-- llvm/test/CodeGen/RISCV/riscv-promote-constant.ll | 148
-rw-r--r-- llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll | 156
-rw-r--r-- llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll | 4
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll | 2
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll | 142
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll | 376
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll | 32
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll | 2
-rw-r--r-- llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll | 2
-rw-r--r-- llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll | 18
-rw-r--r-- llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll | 4
-rw-r--r-- llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3
-rw-r--r-- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll | 3
-rw-r--r-- llvm/test/CodeGen/X86/GlobalISel/select-copy.mir | 136
-rw-r--r-- llvm/test/CodeGen/X86/bittest-big-integer.ll | 1496
-rw-r--r-- llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll | 3
-rw-r--r-- llvm/test/CodeGen/X86/call-graph-section-assembly.ll | 3
-rw-r--r-- llvm/test/CodeGen/X86/gfni-shifts.ll | 75
-rw-r--r-- llvm/test/CodeGen/X86/pr166534.ll | 124
-rw-r--r-- llvm/test/CodeGen/X86/vector-shift-ashr-512.ll | 40
-rw-r--r-- llvm/test/CodeGen/X86/vector-shift-lshr-512.ll | 22
-rw-r--r-- llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 19
-rw-r--r-- llvm/test/CodeGen/Xtensa/s32c1i.ll | 7
91 files changed, 13065 insertions, 7529 deletions
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 0cd885e..e85e808 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,10 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
@@ -1832,14 +1829,33 @@ entry:
}
define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: pmlsl2_v8i16_uzp1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q2, [x1, #16]
-; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
-; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q2, [x1, #16]
+; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-NEON-NEXT: str q0, [x0]
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q2, [x1, #16]
+; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-SVE-NEXT: str q0, [x0]
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q2, [x1, #16]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: xtn v2.8b, v2.8h
+; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
%5 = getelementptr inbounds i32, ptr %3, i64 4
%6 = load <8 x i16>, ptr %5, align 4
%7 = trunc <8 x i16> %6 to <8 x i8>
@@ -1991,16 +2007,40 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
}
define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b
-; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b
-; CHECK-NEXT: add v0.8h, v3.8h, v0.8h
-; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: ldp q2, q3, [x1]
+; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: pmull v3.8h, v0.8b, v2.8b
+; CHECK-NEON-NEXT: pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT: add v0.8h, v3.8h, v0.8h
+; CHECK-NEON-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-NEON-NEXT: str q0, [x0]
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ldp q2, q3, [x1]
+; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT: pmull v3.8h, v0.8b, v2.8b
+; CHECK-SVE-NEXT: pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT: add v0.8h, v3.8h, v0.8h
+; CHECK-SVE-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-SVE-NEXT: str q0, [x0]
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldp q2, q3, [x1]
+; CHECK-GI-NEXT: mov d4, v0.d[1]
+; CHECK-GI-NEXT: xtn v2.8b, v2.8h
+; CHECK-GI-NEXT: xtn v3.8b, v3.8h
+; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT: pmull v2.8h, v4.8b, v3.8b
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sub v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
entry:
%5 = load <8 x i16>, ptr %3, align 4
%6 = trunc <8 x i16> %5 to <8 x i8>
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index cad5df0..68ab890 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -430,12 +430,12 @@ declare i32 @foo()
; Test case distilled from 126.gcc.
; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor.
-define void @build_modify_expr() nounwind ssp {
+define void @build_modify_expr(i32 %cond) nounwind ssp {
; CHECK-LABEL: build_modify_expr:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ret
entry:
- switch i32 undef, label %sw.bb.i.i [
+ switch i32 %cond, label %sw.bb.i.i [
i32 69, label %if.end85
i32 70, label %if.end85
i32 71, label %if.end85
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 2a8b3ce2..8cb319b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -1,11 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
@@ -2721,14 +2716,24 @@ entry:
}
define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
-; CHECK-LABEL: test_vmull_p64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT: mov x1, v0.d[1]
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vmull_p64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d0, x1
+; CHECK-SD-NEXT: fmov d1, x0
+; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: fmov x0, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vmull_p64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: fmov d1, x1
+; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x1, d1
+; CHECK-GI-NEXT: ret
entry:
%vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
%vmull3.i = bitcast <16 x i8> %vmull2.i to i128
@@ -2736,12 +2741,22 @@ entry:
}
define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
-; CHECK-LABEL: test_vmull_high_p64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
-; CHECK-NEXT: mov x1, v0.d[1]
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vmull_high_p64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: fmov x0, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vmull_high_p64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x1, d1
+; CHECK-GI-NEXT: ret
entry:
%0 = extractelement <2 x i64> %a, i32 1
%1 = extractelement <2 x i64> %b, i32 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index e6df9f2..90abc7d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2,44 +2,35 @@
; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for pmull8h
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for commutable_pmull8h
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_1s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_low
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_dup_high
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_low
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmull_from_extract_duplane_high
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_commutable_pmull_64
+; CHECK-GI: warning: Instruction selection used fallback path for sqdmulh_1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d
define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull8h:
@@ -2895,11 +2886,18 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
}
define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup_high:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup v1.16b, w0
-; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: pmull_from_extract_dup_high:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: dup v1.16b, w0
+; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: pmull_from_extract_dup_high:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: dup v1.8b, w0
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
%rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
%rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -2924,12 +2922,20 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs)
}
define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane_high:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: dup v1.16b, v1.b[0]
-; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: pmull_from_extract_duplane_high:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: dup v1.16b, v1.b[0]
+; CHECK-SD-NEXT: pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: pmull_from_extract_duplane_high:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: dup v1.8b, v1.b[0]
+; CHECK-GI-NEXT: pmull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
%lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -3245,21 +3251,35 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
}
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: test_pmull_64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_pmull_64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov d0, x1
+; CHECK-SD-NEXT: fmov d1, x0
+; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_pmull_64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: fmov d1, x1
+; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT: ret
%val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
ret <16 x i8> %val
}
define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
-; CHECK-LABEL: test_pmull_high_64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_pmull_high_64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_pmull_high_64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT: ret
%l_hi = extractelement <2 x i64> %l, i32 1
%r_hi = extractelement <2 x i64> %r, i32 1
%val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
@@ -3267,13 +3287,22 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
}
define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: test_commutable_pmull_64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT: add v0.16b, v0.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_commutable_pmull_64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov d0, x1
+; CHECK-SD-NEXT: fmov d1, x0
+; CHECK-SD-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT: add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_commutable_pmull_64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov d0, x0
+; CHECK-GI-NEXT: fmov d1, x1
+; CHECK-GI-NEXT: pmull v2.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT: pmull v0.1q, v1.1d, v0.1d
+; CHECK-GI-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT: ret
%1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
%2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l)
%3 = add <16 x i8> %1, %2
diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
index f535e0f..bb7ffb4 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
@@ -162,63 +162,54 @@ body: |
RET_ReallyLR
# CHECK-LABEL: name: test_allocate_split_sve_realigned
-# CHECK: stackSize: 2080
+# CHECK: stackSize: 1056
# CHECK: bb.0.entry:
# CHECK: liveins: $z0, $p0, $lr
-# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
-# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
-# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
-# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
+# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
-# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
-# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
-# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0
+# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930
#
# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
-# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
-# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
-# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1)
#
-# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
-# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
-# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4)
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
# CHECK-NEXT: RET_ReallyLR
# ASM-LABEL: test_allocate_split_sve_realigned
-# ASM: sub sp, sp, #1040
-# ASM-NEXT: .cfi_def_cfa_offset 1040
-# ASM-NEXT: str x29, [sp, #1024]
-# ASM-NEXT: str x30, [sp, #1032]
-# ASM-NEXT: add x29, sp, #1024
+# ASM: stp x29, x30, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: mov x29, sp
# ASM-NEXT: .cfi_def_cfa w29, 16
# ASM-NEXT: .cfi_offset w30, -8
# ASM-NEXT: .cfi_offset w29, -16
#
-# ASM: sub sp, x29, #1024
-# ASM-NEXT: .cfi_def_cfa wsp, 1040
-# ASM-NEXT: ldr x30, [sp, #1032]
-# ASM-NEXT: ldr x29, [sp, #1024]
-# ASM-NEXT: add sp, sp, #1040
+# ASM: mov sp, x29
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldp x29, x30, [sp], #16
# ASM-NEXT: .cfi_def_cfa_offset 0
# ASM-NEXT: .cfi_restore w30
# ASM-NEXT: .cfi_restore w29
-# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
#
-# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
# UNWINDINFO: DW_CFA_def_cfa_offset: +0
# UNWINDINFO-NEXT: DW_CFA_restore: reg30
# UNWINDINFO-NEXT: DW_CFA_restore: reg29
diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
index df4889b..bd6c168 100644
--- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll
+++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
@@ -1,10 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE
; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
-
-; CHECK-GI: warning: Instruction selection used fallback path for test_pmull_high_p8_128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_p8_64
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
@@ -521,12 +518,12 @@ entry:
}
define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
-; CHECK-LABEL: test_pmull_high_p8_128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x3
-; CHECK-NEXT: fmov d1, x1
-; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK-LE-LABEL: test_pmull_high_p8_128:
+; CHECK-LE: // %bb.0: // %entry
+; CHECK-LE-NEXT: fmov d0, x3
+; CHECK-LE-NEXT: fmov d1, x1
+; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b
+; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_128:
; CHECK-BE: // %bb.0: // %entry
@@ -538,6 +535,15 @@ define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_pmull_high_p8_128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.d[0], x0
+; CHECK-GI-NEXT: mov v1.d[0], x2
+; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: mov v1.d[1], x3
+; CHECK-GI-NEXT: pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
entry:
%a = bitcast i128 %aa to <16 x i8>
%b = bitcast i128 %bb to <16 x i8>
diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir
new file mode 100644
index 0000000..34e8cf2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s
+# Check that we copy implicit operands.
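+# When two LDRQui loads are paired into a single LDPQi, any implicit operands
+# on the original instructions (e.g. implicit-def $q4_q5) must be carried over
+# to the merged instruction, so that later reads of the full tuple registers
+# (checked via the ORRv16i8 uses of $q4/$q5 below) remain defined.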
+---
+name: impdef_op1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $lr
+ ; CHECK-LABEL: name: impdef_op1
+ ; CHECK: liveins: $lr
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+ ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4
+ ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5
+ ; CHECK-NEXT: RET_ReallyLR
+ renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+ renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128))
+ $q0 = ORRv16i8 $q4, killed $q4
+ $q1 = ORRv16i8 $q5, killed $q5
+ RET_ReallyLR
+...
+---
+name: impdef_op2
+body: |
+ bb.0:
+ liveins: $lr
+ ; CHECK-LABEL: name: impdef_op2
+ ; CHECK: liveins: $lr
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+ ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4
+ ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5
+ ; CHECK-NEXT: RET_ReallyLR
+ renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128))
+ renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128))
+ $q0 = ORRv16i8 $q4, killed $q4
+ $q1 = ORRv16i8 $q5, killed $q5
+ RET_ReallyLR
+...
+---
+name: impdef_both
+body: |
+ bb.0:
+ liveins: $lr
+ ; CHECK-LABEL: name: impdef_both
+ ; CHECK: liveins: $lr
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128))
+ ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4
+ ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5
+ ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20
+ ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21
+ ; CHECK-NEXT: RET_ReallyLR
+ renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+ renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128))
+ $q0 = ORRv16i8 $q4, killed $q4
+ $q1 = ORRv16i8 $q5, killed $q5
+ $q2 = ORRv16i8 $q20, killed $q20
+ $q3 = ORRv16i8 $q21, killed $q21
+ RET_ReallyLR
+...
+---
+name: impdef_both_same
+body: |
+ bb.0:
+ liveins: $lr
+ ; CHECK-LABEL: name: impdef_both_same
+ ; CHECK: liveins: $lr
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+ ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4
+ ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5
+ ; CHECK-NEXT: RET_ReallyLR
+ renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+ renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128))
+ $q0 = ORRv16i8 $q4, killed $q4
+ $q1 = ORRv16i8 $q5, killed $q5
+ RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
index 7f0968c..f77ada4 100644
--- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s
+; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s
declare void @standard_cc_func()
declare preserve_mostcc void @preserve_mostcc_func()
@@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func()
define preserve_mostcc void @preserve_mostcc1() nounwind {
entry:
;CHECK-LABEL: preserve_mostcc1
-;CHECK-NOT: stp
-;CHECK-NOT: str
-;CHECK: str x15
-;CHECK-NEXT: stp x14, x13,
-;CHECK-NEXT: stp x12, x11,
-;CHECK-NEXT: stp x10, x9,
-;CHECK: bl _standard_cc_func
+;CHECK-DARWIN-NOT: stp
+;CHECK-DARWIN-NOT: str
+;CHECK-DARWIN: str x15
+;CHECK-DARWIN-NEXT: stp x14, x13,
+;CHECK-DARWIN-NEXT: stp x12, x11,
+;CHECK-DARWIN-NEXT: stp x10, x9,
+;CHECK-WIN: stp x15, x14
+;CHECK-WIN-NEXT: stp x13, x12,
+;CHECK-WIN-NEXT: stp x11, x10,
+;CHECK-WIN-NEXT: stp x9, x30
+;CHECK: bl {{_?}}standard_cc_func
call void @standard_cc_func()
-;CHECK: ldp x10, x9,
-;CHECK-NEXT: ldp x12, x11,
-;CHECK-NEXT: ldp x14, x13,
-;CHECK-NEXT: ldr x15
+;CHECK-DARWIN: ldp x10, x9,
+;CHECK-DARWIN-NEXT: ldp x12, x11,
+;CHECK-DARWIN-NEXT: ldp x14, x13,
+;CHECK-DARWIN-NEXT: ldr x15
+;CHECK-WIN: ldp x9, x30
+;CHECK-WIN-NEXT: ldp x11, x10,
+;CHECK-WIN-NEXT: ldp x13, x12,
+;CHECK-WIN-NEXT: ldp x15, x14,
ret void
}
@@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind {
entry:
;CHECK-LABEL: preserve_mostcc2
;CHECK-NOT: x14
-;CHECK: stp x29, x30,
+;CHECK-DARWIN: stp x29, x30,
+;CHECK-WIN: str x30
;CHECK-NOT: x14
-;CHECK: bl _preserve_mostcc_func
+;CHECK: bl {{_?}}preserve_mostcc_func
call preserve_mostcc void @preserve_mostcc_func()
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
index 71c6380..8a0ac6d 100644
--- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll
@@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 {
attributes #0 = { "target-features"="+sve" }
;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
index c13dd33..f65aec6 100644
--- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -737,36 +737,23 @@ entry:
}
declare ptr @memset(ptr, i32, i32)
-; FIXME: aarch64-split-sve-objects is currently not supported in this function
-; as it requires stack reealignment (for the 32-byte aligned alloca).
-; GPR CSRs
-; <hazard padding>
-; FPR CSRs
-; <hazrd padding>
-; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here!
-; <realignment padding>
-; -> sp
define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: zpr_and_ppr_local_realignment:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #1040
-; CHECK-NEXT: sub x9, sp, #1040
-; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK-NEXT: add x29, sp, #1024
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #2064
+; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: addvl x9, x9, #-2
-; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: sub x8, x29, #1024
-; CHECK-NEXT: str p0, [x8, #-1, mul vl]
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
; CHECK-NEXT: str z0, [x8, #-2, mul vl]
; CHECK-NEXT: str x0, [sp]
-; CHECK-NEXT: sub sp, x29, #1024
-; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%ppr_local = alloca <vscale x 16 x i1>
%zpr_local = alloca <vscale x 16 x i8>
@@ -805,3 +792,316 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x
store volatile i64 %gpr, ptr %gpr_local
ret void
}
+
+; Only PPR callee-saves + a VLA
+; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated
+; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`.
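+; Rough frame layout implied by the CHECK lines below (high to low addresses):
+; GPR CSRs (x29, x30, x19) <- FP
+; PPR CSRs (p4-p6)
+; <VLA>
+; -> sp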
+define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) {
+; CHECK-LABEL: only_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"()
+ ret void
+}
+
+; Only ZPR callee-saves + a VLA
+; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated
+; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`.
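+; Rough frame layout implied by the CHECK lines below (high to low addresses):
+; GPR CSRs (x29, x30, x19) <- FP
+; <hazard padding (1024)>
+; ZPR CSRs (z8-z10)
+; <hazard padding (1024)>
+; <VLA>
+; -> sp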
+define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) {
+; CHECK-LABEL: only_zpr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1024
+; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x8, #-3
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: sub sp, x29, #1024
+; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+; PPR+ZPR callee-saves + a VLA
+; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10)
+; callee-saves allocated separately, with hazard padding of 1024 between the
+; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves
+; restored by `FP + addvl #-1`.
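+; Rough frame layout implied by the CHECK lines below (high to low addresses):
+; GPR CSRs (x29, x30, x19) <- FP
+; PPR CSRs (p4-p6)
+; <hazard padding (1024)>
+; ZPR CSRs (z8-z10)
+; <hazard padding (1024)>
+; <VLA>
+; -> sp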
+define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) {
+; CHECK-LABEL: zpr_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x8, #-4
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+; Only PPR callee-saves (and ZPR/PPR locals) + a VLA
+; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with
+; hazard padding after the PPR callee saves (1024) and after the ZPR local area
+; (1024) -- coalesced to 2048. Only PPRs restored by moving the SP to
+; `FP + addvl #-1`.
+define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) {
+; CHECK-LABEL: sve_locals_only_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #2048
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x29, #-9, mul vl]
+; CHECK-NEXT: str z0, [x8, #-3, mul vl]
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"()
+ call void (...) @llvm.fake.use(ptr %alloc)
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA
+; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with
+; hazard padding before the ZPR callee saves (1024) and after the ZPR local area
+; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`.
+define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) {
+; CHECK-LABEL: sve_locals_only_zpr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-5, mul vl]
+; CHECK-NEXT: addvl sp, x8, #-4
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"()
+ call void (...) @llvm.fake.use(ptr %alloc)
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA
+; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10)
+; callee-saves, with hazard padding before the ZPR callee saves (1024) and after
+; the ZPR local area (1024). ZPRs restored by moving the SP to
+; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`.
+define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) {
+; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-4
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x29, #-9, mul vl]
+; CHECK-NEXT: str z0, [x8, #-6, mul vl]
+; CHECK-NEXT: addvl sp, x8, #-5
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"()
+ call void (...) @llvm.fake.use(ptr %alloc)
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index bdee359..7087476 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -3512,14 +3512,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
;
; CHECK64-LABEL: svecc_call_dynamic_alloca:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: .cfi_def_cfa_offset 128
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
; CHECK64-NEXT: cntd x9
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 64
; CHECK64-NEXT: .cfi_offset w19, -8
; CHECK64-NEXT: .cfi_offset w20, -16
@@ -3529,7 +3528,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: .cfi_offset w30, -56
; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: addvl sp, sp, #-2
; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -3542,30 +3541,32 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128
; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: mov x19, sp
; CHECK64-NEXT: mov w2, w1
@@ -3595,22 +3596,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK64-NEXT: sub x8, x29, #64
; CHECK64-NEXT: movk w0, #59491, lsl #16
; CHECK64-NEXT: addvl sp, x8, #-18
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: addvl sp, x29, #-2
+; CHECK64-NEXT: .cfi_restore z8
+; CHECK64-NEXT: .cfi_restore z9
+; CHECK64-NEXT: .cfi_restore z10
+; CHECK64-NEXT: .cfi_restore z11
+; CHECK64-NEXT: .cfi_restore z12
+; CHECK64-NEXT: .cfi_restore z13
+; CHECK64-NEXT: .cfi_restore z14
+; CHECK64-NEXT: .cfi_restore z15
; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
@@ -3623,21 +3633,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: .cfi_restore z8
-; CHECK64-NEXT: .cfi_restore z9
-; CHECK64-NEXT: .cfi_restore z10
-; CHECK64-NEXT: .cfi_restore z11
-; CHECK64-NEXT: .cfi_restore z12
-; CHECK64-NEXT: .cfi_restore z13
-; CHECK64-NEXT: .cfi_restore z14
-; CHECK64-NEXT: .cfi_restore z15
-; CHECK64-NEXT: sub sp, x29, #64
-; CHECK64-NEXT: .cfi_def_cfa wsp, 128
-; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload
-; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: .cfi_def_cfa wsp, 64
+; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
; CHECK64-NEXT: .cfi_def_cfa_offset 0
; CHECK64-NEXT: .cfi_restore w19
; CHECK64-NEXT: .cfi_restore w20
@@ -3649,151 +3650,291 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_call_dynamic_alloca:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -8
-; CHECK1024-NEXT: .cfi_offset w20, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov x19, sp
-; CHECK1024-NEXT: mov w2, w1
-; CHECK1024-NEXT: mov w8, w0
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov w8, w8
-; CHECK1024-NEXT: mov x9, sp
-; CHECK1024-NEXT: mov x20, x0
-; CHECK1024-NEXT: add x8, x8, #15
-; CHECK1024-NEXT: and x8, x8, #0x1fffffff0
-; CHECK1024-NEXT: sub x8, x9, x8
-; CHECK1024-NEXT: mov sp, x8
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w20, #0, .LBB35_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB35_2: // %entry
-; CHECK1024-NEXT: mov x0, x8
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w20, #0, .LBB35_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB35_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: sub x8, x29, #1024
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: addvl sp, x8, #-18
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: sub sp, x29, #1024
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w20
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1
+; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8
+; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0
+; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: mov x19, sp
+; CHECK1024-SPLITSVE-NEXT: mov w2, w1
+; CHECK1024-SPLITSVE-NEXT: mov w8, w0
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov w8, w8
+; CHECK1024-SPLITSVE-NEXT: mov x9, sp
+; CHECK1024-SPLITSVE-NEXT: mov x20, x0
+; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8
+; CHECK1024-SPLITSVE-NEXT: mov sp, x8
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, x8
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
%ptr = alloca i8, i32 %P1
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
@@ -3931,23 +4072,22 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
;
; CHECK64-LABEL: svecc_call_realign:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: .cfi_def_cfa_offset 128
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_def_cfa_offset 64
; CHECK64-NEXT: cntd x9
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w26, -24
-; CHECK64-NEXT: .cfi_offset w27, -32
-; CHECK64-NEXT: .cfi_offset w28, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w26, -16
+; CHECK64-NEXT: .cfi_offset w27, -24
+; CHECK64-NEXT: .cfi_offset w28, -32
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: .cfi_offset w30, -56
; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: addvl sp, sp, #-2
; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -3960,30 +4100,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128
; CHECK64-NEXT: sub x9, sp, #1088
; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK64-NEXT: mov w2, w1
@@ -4006,22 +4148,31 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK64-NEXT: sub x8, x29, #64
; CHECK64-NEXT: movk w0, #59491, lsl #16
; CHECK64-NEXT: addvl sp, x8, #-18
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: addvl sp, x29, #-2
+; CHECK64-NEXT: .cfi_restore z8
+; CHECK64-NEXT: .cfi_restore z9
+; CHECK64-NEXT: .cfi_restore z10
+; CHECK64-NEXT: .cfi_restore z11
+; CHECK64-NEXT: .cfi_restore z12
+; CHECK64-NEXT: .cfi_restore z13
+; CHECK64-NEXT: .cfi_restore z14
+; CHECK64-NEXT: .cfi_restore z15
; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
@@ -4034,20 +4185,11 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: .cfi_restore z8
-; CHECK64-NEXT: .cfi_restore z9
-; CHECK64-NEXT: .cfi_restore z10
-; CHECK64-NEXT: .cfi_restore z11
-; CHECK64-NEXT: .cfi_restore z12
-; CHECK64-NEXT: .cfi_restore z13
-; CHECK64-NEXT: .cfi_restore z14
-; CHECK64-NEXT: .cfi_restore z15
-; CHECK64-NEXT: sub sp, x29, #64
-; CHECK64-NEXT: .cfi_def_cfa wsp, 128
-; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: .cfi_def_cfa wsp, 64
+; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
; CHECK64-NEXT: .cfi_def_cfa_offset 0
; CHECK64-NEXT: .cfi_restore w19
; CHECK64-NEXT: .cfi_restore w26
@@ -4058,140 +4200,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_call_realign:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub x9, sp, #2048
-; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK1024-NEXT: mov w2, w1
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB36_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB36_2: // %entry
-; CHECK1024-NEXT: mov x0, sp
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB36_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB36_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: sub x8, x29, #1024
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: addvl sp, x8, #-18
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: sub sp, x29, #1024
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048
+; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_realign:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048
+; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK1024-SPLITSVE-NEXT: mov w2, w1
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, sp
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
%ptr = alloca i8, i32 1000, align 32
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
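A note on the two check prefixes above, inferred from the CHECK lines rather than stated anywhere in the test: CHECK1024-NOSPLITSVE keeps a single `addvl sp, sp, #-18` save area holding both the predicate and the Z-register saves above the 1024-byte hazard padding, while CHECK1024-SPLITSVE saves the predicates in a small `addvl sp, sp, #-2` area next to the GPR pairs and moves the sixteen Z-register saves below the padding, which is why its `.cfi_escape` offsets start at 24 * IncomingVG instead of 8 * IncomingVG. The d8 expression in the non-split block checks out by hand, using CFA = x29 + 64 (so CFA is the entry SP) and VL = 8 * VG bytes:

    addr(d8) = (CFA - 1088 - 18 * VL) + 17 * VL = CFA - 8 * VG - 1088

and in the split block z8 sits at `[sp, #15, mul vl]` with the same SP, giving CFA - 3 * VL - 1088 = CFA - 24 * IncomingVG - 1088. The expressions are written against IncomingVG rather than VG presumably because the function toggles streaming mode (smstop/smstart around the memset call), so the current vector length can differ from the one the saves were laid out with.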
@@ -4311,13 +4583,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
;
; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca:
; CHECK64: // %bb.0: // %entry
-; CHECK64-NEXT: sub sp, sp, #128
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #64
-; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-18
+; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK64-NEXT: mov x29, sp
+; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK64-NEXT: addvl sp, sp, #-2
; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -4330,41 +4601,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-16
+; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK64-NEXT: sub sp, sp, #112
; CHECK64-NEXT: addvl sp, sp, #-1
; CHECK64-NEXT: mov x19, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 64
-; CHECK64-NEXT: .cfi_offset w19, -16
-; CHECK64-NEXT: .cfi_offset w20, -24
-; CHECK64-NEXT: .cfi_offset w26, -32
-; CHECK64-NEXT: .cfi_offset w27, -40
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w20, -16
+; CHECK64-NEXT: .cfi_offset w26, -24
+; CHECK64-NEXT: .cfi_offset w27, -32
; CHECK64-NEXT: .cfi_offset w28, -48
; CHECK64-NEXT: .cfi_offset w30, -56
; CHECK64-NEXT: .cfi_offset w29, -64
-; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128
-; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128
+; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128
; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK64-NEXT: ubfiz x8, x0, #2, #32
; CHECK64-NEXT: mov x9, sp
@@ -4385,22 +4658,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
; CHECK64-NEXT: sub x8, x29, #64
; CHECK64-NEXT: movk w0, #59491, lsl #16
; CHECK64-NEXT: addvl sp, x8, #-18
-; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK64-NEXT: addvl sp, x29, #-2
; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
@@ -4413,131 +4687,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 %
; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: sub sp, x29, #64
-; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
-; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload
-; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #128
+; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1072
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: mov x19, sp
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w20, -24
-; CHECK1024-NEXT: .cfi_offset w26, -32
-; CHECK1024-NEXT: .cfi_offset w27, -40
-; CHECK1024-NEXT: .cfi_offset w28, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088
-; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK1024-NEXT: ubfiz x8, x0, #2, #32
-; CHECK1024-NEXT: mov x9, sp
-; CHECK1024-NEXT: add x8, x8, #15
-; CHECK1024-NEXT: and x8, x8, #0x7fffffff0
-; CHECK1024-NEXT: sub x20, x9, x8
-; CHECK1024-NEXT: mov sp, x20
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: add x0, x19, #8
-; CHECK1024-NEXT: bl bar
-; CHECK1024-NEXT: sub x0, x29, #1024
-; CHECK1024-NEXT: addvl x0, x0, #-19
-; CHECK1024-NEXT: bl bar
-; CHECK1024-NEXT: mov x0, x20
-; CHECK1024-NEXT: bl bar
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: sub x8, x29, #1024
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: addvl sp, x8, #-18
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: sub sp, x29, #1024
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32
+; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp
+; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0
+; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8
+; CHECK1024-NOSPLITSVE-NEXT: bl bar
+; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19
+; CHECK1024-NOSPLITSVE-NEXT: bl bar
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20
+; CHECK1024-NOSPLITSVE-NEXT: bl bar
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1
+; CHECK1024-SPLITSVE-NEXT: mov x19, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088
+; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32
+; CHECK1024-SPLITSVE-NEXT: mov x9, sp
+; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15
+; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0
+; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8
+; CHECK1024-SPLITSVE-NEXT: mov sp, x20
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8
+; CHECK1024-SPLITSVE-NEXT: bl bar
+; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19
+; CHECK1024-SPLITSVE-NEXT: bl bar
+; CHECK1024-SPLITSVE-NEXT: mov x0, x20
+; CHECK1024-SPLITSVE-NEXT: bl bar
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: mov sp, x29
+; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
%a = alloca i32, i32 10
%b = alloca <vscale x 4 x i32>
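A similar hand check for the dynamic-plus-scalable case above, again read off the CHECK lines rather than spelled out by the test: the three `bl bar` calls pass the three objects in turn, %a via `add x0, x19, #8` off the locals base pointer, the scalable %b via `sub x0, x29, #1024` followed by `addvl x0, x0, #-19`, and the dynamic alloca via x20. The %b address comes out identical in both layouts because the saves below x29 total 1024 + 18 * VL bytes either way (1024 then 18 * VL in the non-split form, 2 * VL + 1024 + 16 * VL in the split form), leaving the one-vector %b directly underneath:

    addr(%b) = x29 - 1024 - 19 * VL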
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 3685e9c..b2635d3 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,6 +730,111 @@ entry:
ret void
}
+define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+ <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
+; CHECK-LABEL: store_factor8:
+; CHECK: .Lfunc_begin17:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
+; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]]
+; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
+; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]]
+; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
+; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]]
+; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
+; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]]
+; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
+; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
+; CHECK-NEXT: ret
+
+ %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+ store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+ ret void
+}
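The new store_factor8 test pins down the lowering of a factor-8 interleaved store: four zip1/zip2 pairs merge lanes from inputs j and j+4, then two st4 instructions each scatter four registers. A minimal stand-alone C sketch of the same pattern, written with standard ACLE NEON intrinsics (the function name and signature are made up for illustration and are not part of the test):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Hypothetical equivalent of store_factor8: writes out[8*i + j] = lane i
       of the j-th input, i.e. all eight inputs interleaved lane by lane. */
    void store_factor8_neon(int32_t *out,
                            int32x4_t a0, int32x4_t a1, int32x4_t a2, int32x4_t a3,
                            int32x4_t a4, int32x4_t a5, int32x4_t a6, int32x4_t a7) {
      int32x4x4_t lo, hi;
      lo.val[0] = vzip1q_s32(a0, a4);   /* a0[0], a4[0], a0[1], a4[1] */
      lo.val[1] = vzip1q_s32(a1, a5);
      lo.val[2] = vzip1q_s32(a2, a6);
      lo.val[3] = vzip1q_s32(a3, a7);
      hi.val[0] = vzip2q_s32(a0, a4);   /* a0[2], a4[2], a0[3], a4[3] */
      hi.val[1] = vzip2q_s32(a1, a5);
      hi.val[2] = vzip2q_s32(a2, a6);
      hi.val[3] = vzip2q_s32(a3, a7);
      vst4q_s32(out, lo);        /* lanes 0 and 1 of all eight inputs */
      vst4q_s32(out + 16, hi);   /* lanes 2 and 3 of all eight inputs */
    }

Compiled at -O2 for AArch64 this should produce essentially the zip1/zip2/st4 sequence the CHECK lines capture, though register assignment will of course differ.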
+
+define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+ <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7,
+ <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11,
+ <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
+; CHECK-LABEL: store_factor16:
+; CHECK: .Lfunc_begin18:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
+; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
+; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
+; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
+; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
+; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]]
+; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]]
+; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
+; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
+; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
+; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]]
+; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]]
+; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]]
+; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]]
+; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]]
+; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]]
+; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
+; CHECK-NEXT: add x8, x0, #128
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
+; CHECK-NEXT: add x8, x0, #192
+; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+
+ %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+ %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+ store <64 x i32> %interleaved.vec, ptr %ptr, align 4
+ ret void
+}
+
declare void @llvm.dbg.value(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index e411c23..7b5621f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
%tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, v1
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
%tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1cd9c0b..2351c96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v3, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 637aaf7..7f10ee4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, v0
-; GCN-NEXT: v_mov_b32_e32 v5, v1
-; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GCN-NEXT: v_mov_b32_e32 v5, v2
+; GCN-NEXT: v_mov_b32_e32 v6, v1
+; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
@@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
-; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
-; GCN-NEXT: v_mov_b32_e32 v2, v8
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
@@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
@@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
; GFX11-NEXT: v_mov_b32_e32 v2, v9
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i96:
@@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
; GFX12-NEXT: v_mov_b32_e32 v2, v8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i96:
@@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
@@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT: v_mov_b32_e32 v10, v2
+; GFX7-NEXT: v_mov_b32_e32 v11, v3
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v12, v4
+; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX7-NEXT: v_mov_b32_e32 v2, v11
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX7-NEXT: v_mov_b32_e32 v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
@@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT: v_mov_b32_e32 v10, v2
+; GFX8-NEXT: v_mov_b32_e32 v11, v3
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v12, v4
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX8-NEXT: v_mov_b32_e32 v2, v11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
@@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v11
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX9-NEXT: v_mov_b32_e32 v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
@@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v2, v11
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
+; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13
+; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6]
@@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4
; GFX11-NEXT: v_mov_b32_e32 v12, v3
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
-; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6
-; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7
+; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, v11
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
+; GFX12-NEXT: v_mov_b32_e32 v2, v13
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
@@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
-; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11]
+; GFX1250-NEXT: v_mov_b32_e32 v10, v1
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b32_e32 v13, v10
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT: v_mov_b32_e32 v11, v12
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11]
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
@@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mov_b32_e32 v22, v18
-; GFX7-NEXT: v_mov_b32_e32 v18, v19
-; GFX7-NEXT: v_mov_b32_e32 v19, v16
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mov_b32_e32 v21, v22
+; GFX7-NEXT: v_mov_b32_e32 v22, v23
+; GFX7-NEXT: v_mov_b32_e32 v23, v18
+; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX7-NEXT: v_mov_b32_e32 v20, v23
+; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX7-NEXT: v_mov_b32_e32 v21, v20
-; GFX7-NEXT: v_mov_b32_e32 v20, v11
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v12, v22
+; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-NEXT: v_mov_b32_e32 v1, v13
+; GFX7-NEXT: v_mov_b32_e32 v2, v14
+; GFX7-NEXT: v_mov_b32_e32 v7, v11
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mov_b32_e32 v22, v18
-; GFX8-NEXT: v_mov_b32_e32 v18, v19
-; GFX8-NEXT: v_mov_b32_e32 v19, v16
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mov_b32_e32 v21, v22
+; GFX8-NEXT: v_mov_b32_e32 v22, v23
+; GFX8-NEXT: v_mov_b32_e32 v23, v18
+; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX8-NEXT: v_mov_b32_e32 v20, v23
+; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX8-NEXT: v_mov_b32_e32 v21, v20
-; GFX8-NEXT: v_mov_b32_e32 v20, v11
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX8-NEXT: v_mov_b32_e32 v12, v22
+; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-NEXT: v_mov_b32_e32 v1, v13
+; GFX8-NEXT: v_mov_b32_e32 v2, v14
+; GFX8-NEXT: v_mov_b32_e32 v7, v11
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mov_b32_e32 v22, v18
-; GFX9-NEXT: v_mov_b32_e32 v18, v19
-; GFX9-NEXT: v_mov_b32_e32 v19, v16
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mov_b32_e32 v21, v22
+; GFX9-NEXT: v_mov_b32_e32 v22, v23
+; GFX9-NEXT: v_mov_b32_e32 v23, v18
+; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX9-NEXT: v_mov_b32_e32 v20, v23
+; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX9-NEXT: v_mov_b32_e32 v21, v20
-; GFX9-NEXT: v_mov_b32_e32 v20, v11
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX9-NEXT: v_mov_b32_e32 v12, v22
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-NEXT: v_mov_b32_e32 v2, v14
+; GFX9-NEXT: v_mov_b32_e32 v7, v11
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
@@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v16, v0
; GFX10-NEXT: v_mov_b32_e32 v17, v1
-; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10
+; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11
+; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0
-; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
-; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
-; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT: v_mov_b32_e32 v20, v22
-; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
-; GFX10-NEXT: v_mov_b32_e32 v20, v18
+; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21]
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19]
+; GFX10-NEXT: v_mov_b32_e32 v18, v23
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
+; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX10-NEXT: v_mov_b32_e32 v19, v24
+; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v19, v22
-; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
+; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15
+; GFX10-NEXT: v_mov_b32_e32 v18, v21
+; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0
-; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
-; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22]
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v13, v1
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v14, v21
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
-; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6
+; GFX10-NEXT: v_mov_b32_e32 v14, v20
+; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22]
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4
; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7
; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0
+; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15
; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14
-; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8]
-; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v20, v8
-; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0
-; GFX11-NEXT: v_mov_b32_e32 v21, v22
-; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12
+; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21]
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0
+; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v6, v25
-; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7]
+; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8]
+; GFX11-NEXT: v_mov_b32_e32 v7, v23
+; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0
+; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX11-NEXT: v_mov_b32_e32 v8, v24
+; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8]
+; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21
+; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2
-; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2
-; GFX11-NEXT: v_mov_b32_e32 v11, v1
-; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12
-; GFX11-NEXT: v_mov_b32_e32 v12, v24
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
-; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22]
+; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
+; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2
+; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7]
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22]
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11
+; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
-; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
+; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21]
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX12-NEXT: v_mov_b32_e32 v20, v22
+; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_mov_b32_e32 v18, v23
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0
+; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX12-NEXT: v_mov_b32_e32 v19, v24
+; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19]
; GFX12-NEXT: v_mov_b32_e32 v19, v22
-; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15
+; GFX12-NEXT: v_mov_b32_e32 v18, v21
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
-; GFX12-NEXT: v_mov_b32_e32 v20, v18
-; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13
+; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22]
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
+; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
-; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
+; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
+; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
-; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
-; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9
+; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
-; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
+; GFX1250-NEXT: v_mov_b32_e32 v13, v18
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21]
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
-; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2
; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2
; GFX1250-NEXT: v_mov_b32_e32 v2, v15
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
-; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX8-NEXT: flat_load_dword v4, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v4, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
@@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v4, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4f2c454..01c601f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3
; CHECK-NEXT: v_trunc_f32_e32 v8, v6
; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8
-; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7
-; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6
+; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc
-; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7
-; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10
-; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6
+; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7
+; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3
-; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s13
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s13
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s11
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v1, s11
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
@@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6]
; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9
+; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
+; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12
+; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
+; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v15, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7
; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10
+; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12
+; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4
; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc
; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10
; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11
+; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15
; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v3, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; CGP-NEXT: v_trunc_f32_e32 v5, v4
; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3
-; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v18, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13]
+; CGP-NEXT: v_mul_lo_u32 v5, v17, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v14, v3
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v3, v17, v3
+; CGP-NEXT: v_mul_lo_u32 v18, v17, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v14, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v13
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13]
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v15
+; CGP-NEXT: v_mul_lo_u32 v5, v17, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v14, v4
+; CGP-NEXT: v_xor_b32_e32 v13, v10, v15
+; CGP-NEXT: v_mul_hi_u32 v10, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v17, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_mul_hi_u32 v14, v12, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v17, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v10, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v11, v4
-; CGP-NEXT: v_mul_hi_u32 v14, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v10, v3
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_mul_hi_u32 v12, v11, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v11, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v13, v4
; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5]
-; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
@@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5]
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
@@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v13, v0
-; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v3, v15, v0
+; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v3
; CGP-NEXT: v_xor_b32_e32 v1, v2, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v5, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CGP-NEXT: v_trunc_f32_e32 v7, v6
; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v10, v6
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11]
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v11, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v16, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v6
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v6, v15, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v7, v11
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v11
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5
+; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11]
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v9, v12, v6
+; CGP-NEXT: v_xor_b32_e32 v14, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT: v_mul_hi_u32 v12, v10, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v6, v15, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v8, v5
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v6
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_mul_hi_u32 v10, v9, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v14, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v14, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v14, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v11, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v13, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %num, 1235195
ret i64 %result
@@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5]
; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16
; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
@@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
@@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9
-; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
@@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
@@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc
+; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc
+; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc
+; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CHECK-NEXT: v_trunc_f32_e32 v7, v6
; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5
+; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5
+; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4
-; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc
; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
@@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
; GISEL-NEXT: v_trunc_f32_e32 v13, v11
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
-; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2]
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6
; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
+; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v15, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10
; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12
+; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5
; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11
; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15
; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1
; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
@@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v10, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10
; CGP-NEXT: v_trunc_f32_e32 v12, v11
; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_lo_u32 v12, v16, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v18, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v19, v16, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
+; CGP-NEXT: v_mul_lo_u32 v11, v19, v10
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; CGP-NEXT: v_mul_hi_u32 v12, v16, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_mul_lo_u32 v13, v16, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v14
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v16, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v8, v14
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v15, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v17, v9, v14
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v19, v14
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
+; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v17
+; CGP-NEXT: v_mul_lo_u32 v8, v19, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v16, v14
+; CGP-NEXT: v_xor_b32_e32 v18, v9, v17
+; CGP-NEXT: v_mul_hi_u32 v9, v16, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v11
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v9, v19, v14
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v19, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v17, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v12, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v17, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v17, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v18, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v9
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v18, v8
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v18, v9
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10
+; CGP-NEXT: v_mul_hi_u32 v12, v18, v9
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10]
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10]
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v8, v14, v0
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v8, v17, v0
+; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v8
; CGP-NEXT: v_xor_b32_e32 v1, v4, v8
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
@@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v6, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
; CGP-NEXT: v_trunc_f32_e32 v10, v8
; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v16, v14, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
+; CGP-NEXT: v_mul_lo_u32 v6, v16, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
+; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v16, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v14, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v10, v5, v12
-; CGP-NEXT: v_mul_lo_u32 v5, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v6, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
+; CGP-NEXT: v_mul_lo_u32 v5, v16, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
+; CGP-NEXT: v_xor_b32_e32 v15, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v9
+; CGP-NEXT: v_mul_lo_u32 v6, v16, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v11, v9
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v9
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v8, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v10, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v13, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v6
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v10, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v15, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v12, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v14, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
@@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GISEL-NEXT: v_trunc_f32_e32 v5, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4
-; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
+; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3
; GISEL-NEXT: v_mov_b32_e32 v5, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6]
-; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6]
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6]
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7
; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2
; GISEL-NEXT: v_trunc_f32_e32 v8, v6
; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
; GISEL-NEXT: v_mov_b32_e32 v2, v7
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7
-; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6
+; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7
-; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3]
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3]
+; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7]
; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc
; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7
+; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0
+; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
@@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
@@ -2769,27 +2769,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3
; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3
-; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3
+; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7
-; CGP-NEXT: v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v2, v4
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 1441591..f4489c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
-; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
+; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
@@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s2, s17, 31
; GFX9-NEXT: s_ashr_i32 s4, s19, 31
@@ -335,64 +336,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3]
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
-; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
+; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6
-; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4
+; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15]
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc
+; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
@@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX8-NEXT: s_ashr_i32 s6, s19, 31
; GFX8-NEXT: s_mov_b32 s7, s6
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: s_ashr_i32 s10, s3, 31
-; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc
+; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1
-; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3
+; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1]
; GFX8-NEXT: s_add_u32 s0, s18, s6
; GFX8-NEXT: s_addc_u32 s1, s19, s6
; GFX8-NEXT: s_add_u32 s2, s2, s10
@@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: s_addc_u32 s3, s3, s10
; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4
; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
@@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0
; GFX8-NEXT: s_sub_u32 s5, 0, s2
-; GFX8-NEXT: s_subb_u32 s20, 0, s3
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2]
+; GFX8-NEXT: s_subb_u32 s20, 0, s3
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0
-; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc
+; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6
; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s17
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1]
+; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7
+; GFX8-NEXT: v_mov_b32_e32 v6, s17
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2
-; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3
-; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4]
+; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9
+; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5
+; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2
+; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9
-; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2
-; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2
+; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7
; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
+; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v10, s9
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3
+; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v12, s9
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
+; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2
-; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2
+; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
-; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9
; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9
-; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9
+; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s6
+; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX9-NEXT: s_ashr_i32 s6, s19, 31
; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_ashr_i32 s10, s3, 31
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1]
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s18, s6
; GFX9-NEXT: s_addc_u32 s1, s19, s6
; GFX9-NEXT: s_add_u32 s2, s2, s10
; GFX9-NEXT: s_mov_b32 s11, s10
; GFX9-NEXT: s_addc_u32 s3, s3, s10
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14
; GFX9-NEXT: v_add_f32_e32 v1, v1, v15
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
@@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0
; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX9-NEXT: s_sub_u32 s5, 0, s2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc
-; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc
; GFX9-NEXT: s_subb_u32 s20, 0, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc
-; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4
; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1
+; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4
+; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0
-; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v9, s17
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1]
+; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, s17
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4]
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1
+; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc
; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3
+; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT: v_add_u32_e32 v6, v8, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3
+; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9
; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2
; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3
-; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3
+; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
-; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v6
+; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
-; GFX9-NEXT: v_add_u32_e32 v6, v9, v7
-; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v12, s9
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7]
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14
; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9
; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX9-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-NEXT: v_mov_b32_e32 v7, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX9-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
+; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, s6
+; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13]
; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 40b5db0..6f42239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v6, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8
+; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11
+; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s11
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s11
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2]
+; CHECK-NEXT: v_mov_b32_e32 v1, s9
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s9
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
@@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v11, v9
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9
-; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11]
+; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12
+; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0
; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1
; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
@@ -457,148 +457,148 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9
; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2]
+; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2]
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10]
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v12, v10
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v12, v9
; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1]
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1]
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9
+; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7]
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5
+; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v2, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CGP-NEXT: v_trunc_f32_e32 v4, v3
; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v16, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v3
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4]
+; CGP-NEXT: v_mul_lo_u32 v3, v17, v2
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v4, v12
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v5, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v12
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v3, v15
+; CGP-NEXT: v_mul_lo_u32 v3, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v12
+; CGP-NEXT: v_xor_b32_e32 v16, v4, v15
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v10, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v13, v11, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v10, v2
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v16, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v16, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v16, v3
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4]
-; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5]
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0
-; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v12
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v12
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v15
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v15
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
; CGP-NEXT: .LBB2_2: ; %Flow1
@@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v5
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v5
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6]
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v6, v10
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v10
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v10
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4
+; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6]
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v10
+; CGP-NEXT: v_xor_b32_e32 v14, v6, v13
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v15, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v7, v12, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v14, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v4
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v14, v5
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v7, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v7, v11, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v14, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -977,82 +977,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 4096
ret i64 %result
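;
; [Sketch, editorial] The checks above show GlobalISel lowering this
; power-of-two srem through the same general f32-reciprocal expansion used
; for arbitrary divisors (v_rcp_iflag_f32 plus refinement). For comparison,
; a bias-and-mask form computes the same value; a minimal assumed sketch,
; not code from this patch:
define i64 @srem_pow2_sketch(i64 %n) {
  %s    = ashr i64 %n, 63     ; all-ones if %n is negative, else 0
  %bias = lshr i64 %s, 52     ; 4095 for negative %n, so truncation rounds toward zero
  %t    = add i64 %n, %bias
  %q12  = and i64 %t, -4096   ; quotient times 4096
  %r    = sub i64 %n, %q12    ; remainder keeps the dividend's sign
  ret i64 %r
}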
@@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
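;
; [Sketch, editorial] Throughout these checks the signed remainder is
; bracketed by the same pattern: v_ashrrev_i32 takes the dividend's sign,
; xor/add make the operands non-negative, and the final xor/sub pair
; restores the sign. An assumed IR sketch of that structure (not from this
; patch); with a constant positive divisor such as 4096 the %sd half folds
; away:
define i64 @srem_via_urem_sketch(i64 %num, i64 %den) {
  %sn = ashr i64 %num, 63     ; sign mask of the dividend
  %a0 = xor i64 %num, %sn
  %an = sub i64 %a0, %sn      ; |num|
  %sd = ashr i64 %den, 63
  %a1 = xor i64 %den, %sd
  %ad = sub i64 %a1, %sd      ; |den|
  %r  = urem i64 %an, %ad     ; unsigned remainder of the magnitudes
  %t  = xor i64 %r, %sn
  %rs = sub i64 %t, %sn       ; remainder takes the dividend's sign
  ret i64 %rs
}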
@@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5]
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 1235195
ret i64 %result
@@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
+; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9]
+; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v7, v5
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3]
+; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9
-; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3]
+; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11
+; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5
+; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8
+; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6
+; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v12, v10
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10
; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11
; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6
; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
@@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6
; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0
; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8
; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8
; GISEL-NEXT: v_mov_b32_e32 v0, v10
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1]
-; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1]
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11]
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
; GISEL-NEXT: v_trunc_f32_e32 v13, v10
; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0
+; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
-; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2]
; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10]
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
@@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v4, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v12, v10
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v18, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v10
+; CGP-NEXT: v_mul_lo_u32 v12, v15, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v11
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v13
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v12, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v4, v14
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v15, v8, v14
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v14, v4, v16
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v13
+; CGP-NEXT: v_xor_b32_e32 v17, v8, v16
+; CGP-NEXT: v_mul_hi_u32 v8, v15, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v10
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v11
+; CGP-NEXT: v_mul_lo_u32 v8, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
+; CGP-NEXT: v_mul_hi_u32 v9, v15, v13
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v15, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v15, v8
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v17, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v14, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v15, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v17, v8
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v12, v8
+; CGP-NEXT: v_mul_hi_u32 v10, v14, v8
; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v17, v8
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10]
+; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc
+; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0
@@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
@@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v14
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v14
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v16
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v16
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc
; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: .LBB8_2: ; %Flow1
@@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v6
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5]
; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v15, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v4, v12
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v10, v4, v14
; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v10, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v5, v12
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v13, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v10, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v10, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
@@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v14
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v14
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc
; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
@@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GISEL-NEXT: v_trunc_f32_e32 v5, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3
-; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8
+; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0
+; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1]
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8
; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v11, v7
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_trunc_f32_e32 v9, v7
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6]
; GISEL-NEXT: v_mov_b32_e32 v0, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4
-; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4
+; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6]
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc
; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6
; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v3
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; CGP-NEXT: v_mul_lo_u32 v6, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 9e412b6..23ef596 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0
+; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2
+; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
-; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
@@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2]
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s19
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-NEXT: v_mov_b32_e32 v4, s19
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8
-; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7
+; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
@@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s13
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
; GFX8-NEXT: s_sub_u32 s2, 0, s14
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_subb_u32 s3, 0, s15
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, s13
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0
-; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc
-; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1]
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc
+; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc
; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1
-; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1
+; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc
; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX8-NEXT: v_trunc_f32_e32 v4, v3
; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
; GFX8-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7
-; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2
+; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8
+; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14
+; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5]
+; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc
+; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6
+; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2
+; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4
-; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
-; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc
+; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16
+; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13
+; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6
+; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5
-; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3]
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6]
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc
+; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4
+; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5
-; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7
+; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8
-; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc
+; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4
+; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4
; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5
+; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5
+; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7
; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2]
; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1]
@@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
; GFX9-NEXT: s_sub_u32 s2, 0, s6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_subb_u32 s3, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
@@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2]
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6
-; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1
-; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1
+; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v4, v3
; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2
-; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2
+; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9
+; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc
+; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6
+; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2
+; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3
-; GFX9-NEXT: v_add_u32_e32 v4, v17, v4
-; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v4, v5, v4
+; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6
+; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v17
-; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc
+; GFX9-NEXT: v_add_u32_e32 v5, v8, v5
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2
-; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
+; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1]
+; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13
; GFX9-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4
-; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3]
+; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4
+; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5
-; GFX9-NEXT: v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7
+; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7
+; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7
; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6
-; GFX9-NEXT: v_add_u32_e32 v7, v9, v7
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
-; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3]
+; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3]
; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4
; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5
; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
new file mode 100644
index 0000000..d7d623a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+; Make sure we do not infer anything about implicit inputs through an
+; intrinsic call that is not marked nocallback.
+
+declare zeroext i32 @return_i32()
+
+define i32 @test_i32_return() gc "statepoint-example" {
+; CHECK-LABEL: define i32 @test_i32_return(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]])
+; CHECK-NEXT: ret i32 [[CALL1]]
+;
+entry:
+ %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0)
+ %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
+ ret i32 %call1
+}
+
+declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...)
+declare i32 @llvm.experimental.gc.result.i32(token) #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+;.
+; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
new file mode 100644
index 0000000..71c509a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s
+
+; Make sure we infer that no implicit inputs are used through these intrinsics.
+
+define void @use_fake_use(i32 %arg) {
+; CHECK-LABEL: define void @use_fake_use(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]])
+; CHECK-NEXT: ret void
+;
+ call void (...) @llvm.fake.use(i32 %arg)
+ ret void
+}
+
+define void @use_donothing() {
+; CHECK-LABEL: define void @use_donothing(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.donothing()
+ ret void
+}
+
+define void @use_assume(i1 %arg) {
+; CHECK-LABEL: define void @use_assume(
+; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.assume(i1 %arg)
+ ret void
+}
+
+define void @use_trap() {
+; CHECK-LABEL: define void @use_trap(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap()
+ ret void
+}
+
+define void @use_debugtrap() {
+; CHECK-LABEL: define void @use_debugtrap(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @llvm.debugtrap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.debugtrap()
+ ret void
+}
+
+define void @use_ubsantrap() {
+; CHECK-LABEL: define void @use_ubsantrap(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.ubsantrap(i8 0)
+ ret void
+}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
new file mode 100644
index 0000000..06150e42
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s
+
+; These situations are "special" in that they either have an alloca that is
+; not in the entry block or have a dynamic alloca. Both situations affect
+; prolog/epilog generation.
+
+declare amdgpu_gfx void @foo()
+
+define amdgpu_cs_chain void @test_alloca() {
+; GFX12-LABEL: test_alloca:
+; GFX12: ; %bb.0: ; %.entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s0, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200
+; GFX12-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_alloca:
+; GFX942: ; %bb.0: ; %.entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: s_mov_b32 s0, s32
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_add_i32 s32, s0, 0x400
+; GFX942-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NEXT: s_endpgm
+.entry:
+ br label %SW_C
+
+SW_C: ; preds = %.entry
+ %v = alloca i32, i32 1, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %v, align 4
+ ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
+; GFX12-LABEL: test_alloca_var_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s0, s0, 15
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_b32 s0, s0, -16
+; GFX12-NEXT: s_mov_b32 s1, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_lshl_b32 s0, s0, 5
+; GFX12-NEXT: scratch_store_b32 off, v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s32, s1, s0
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_alloca_var_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: s_and_b32 s0, s0, -16
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_lshl_b32 s0, s0, 6
+; GFX942-NEXT: s_mov_b32 s1, s32
+; GFX942-NEXT: s_add_i32 s32, s1, s0
+; GFX942-NEXT: scratch_store_dword off, v0, s1
+; GFX942-NEXT: s_endpgm
+ %v = alloca i32, i32 %count, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %v, align 4
+ ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_var(i32 %count) {
+; GFX12-LABEL: test_alloca_var:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_ctz_i32_b32 s2, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12-NEXT: s_bitset0_b32 s1, s2
+; GFX12-NEXT: s_max_u32 s0, s0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 s1, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1
+; GFX12-NEXT: scratch_store_b32 off, v0, s1
+; GFX12-NEXT: v_readfirstlane_b32 s32, v1
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_alloca_var:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15
+; GFX942-NEXT: v_and_b32_e32 v1, -16, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_mov_b64 s[0:1], exec
+; GFX942-NEXT: s_mov_b32 s2, 0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX942-NEXT: v_readlane_b32 s4, v1, s3
+; GFX942-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX942-NEXT: s_max_u32 s2, s2, s4
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_mov_b32 s0, s32
+; GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1
+; GFX942-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NEXT: v_readfirstlane_b32 s32, v1
+; GFX942-NEXT: s_endpgm
+ %v = alloca i32, i32 %count, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %v, align 4
+ ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_and_call() {
+; GFX12-LABEL: test_alloca_and_call:
+; GFX12: ; %bb.0: ; %.entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_mov_b32 s2, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200
+; GFX12-NEXT: scratch_store_b32 off, v0, s2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_alloca_and_call:
+; GFX942: ; %bb.0: ; %.entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: s_mov_b32 s2, s32
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_add_i32 s32, s2, 0x400
+; GFX942-NEXT: scratch_store_dword off, v0, s2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT: s_endpgm
+.entry:
+ br label %SW_C
+
+SW_C: ; preds = %.entry
+ %v = alloca i32, i32 1, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %v, align 4
+ call amdgpu_gfx void @foo()
+ ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) {
+; GFX12-LABEL: test_alloca_and_call_var_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_getpc_b64 s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, 15
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_b32 s0, s0, -16
+; GFX12-NEXT: s_mov_b32 s1, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_lshl_b32 s0, s0, 5
+; GFX12-NEXT: scratch_store_b32 off, v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s32, s1, s0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_alloca_and_call_var_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_and_b32 s0, s0, -16
+; GFX942-NEXT: s_lshl_b32 s2, s0, 6
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_mov_b32 s3, s32
+; GFX942-NEXT: s_add_i32 s32, s3, s2
+; GFX942-NEXT: scratch_store_dword off, v0, s3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT: s_endpgm
+ %v = alloca i32, i32 %count, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %v, align 4
+ call amdgpu_gfx void @foo()
+ ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
+; GFX12-LABEL: test_alloca_and_call_var:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_ctz_i32_b32 s2, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s3, v1, s2
+; GFX12-NEXT: s_bitset0_b32 s1, s2
+; GFX12-NEXT: s_max_u32 s0, s0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_getpc_b64 s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_mov_b32 s1, s32
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1
+; GFX12-NEXT: scratch_store_b32 off, v0, s1
+; GFX12-NEXT: v_readfirstlane_b32 s32, v1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_alloca_and_call_var:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15
+; GFX942-NEXT: v_and_b32_e32 v1, -16, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_mov_b64 s[0:1], exec
+; GFX942-NEXT: s_mov_b32 s2, 0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX942-NEXT: v_readlane_b32 s4, v1, s3
+; GFX942-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX942-NEXT: s_max_u32 s2, s2, s4
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT: s_mov_b32 s3, s32
+; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1
+; GFX942-NEXT: scratch_store_dword off, v0, s3
+; GFX942-NEXT: v_readfirstlane_b32 s32, v1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT: s_endpgm
+ %v = alloca i32, i32 %count, align 4, addrspace(5)
+ store i32 0, ptr addrspace(5) %v, align 4
+ call amdgpu_gfx void @foo()
+ ret void
+}
+
+define amdgpu_cs_chain void @test_call_and_alloca() {
+; GFX12-LABEL: test_call_and_alloca:
+; GFX12: ; %bb.0: ; %.entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: scratch_store_b32 off, v0, s4
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_call_and_alloca:
+; GFX942: ; %bb.0: ; %.entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: s_mov_b32 s4, s32
+; GFX942-NEXT: s_add_i32 s32, s4, 0x400
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: scratch_store_dword off, v0, s4
+; GFX942-NEXT: s_endpgm
+.entry:
+ br label %SW_C
+
+SW_C: ; preds = %.entry
+ %v = alloca i32, i32 1, align 4, addrspace(5)
+ call amdgpu_gfx void @foo()
+ store i32 0, ptr addrspace(5) %v, align 4
+ ret void
+}
+
+define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) {
+; GFX12-LABEL: test_call_and_alloca_var_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_getpc_b64 s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_add_co_i32 s0, s0, 15
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_b32 s0, s0, -16
+; GFX12-NEXT: s_mov_b32 s4, s32
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_lshl_b32 s0, s0, 5
+; GFX12-NEXT: v_mov_b32_e32 v40, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_i32 s32, s4, s0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT: scratch_store_b32 off, v40, s4
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_call_and_alloca_var_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_lshl_b32 s0, s0, 2
+; GFX942-NEXT: s_add_i32 s0, s0, 15
+; GFX942-NEXT: s_and_b32 s0, s0, -16
+; GFX942-NEXT: s_lshl_b32 s2, s0, 6
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: s_mov_b32 s4, s32
+; GFX942-NEXT: v_mov_b32_e32 v40, 0
+; GFX942-NEXT: s_add_i32 s32, s4, s2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT: scratch_store_dword off, v40, s4
+; GFX942-NEXT: s_endpgm
+ %v = alloca i32, i32 %count, align 4, addrspace(5)
+ call amdgpu_gfx void @foo()
+ store i32 0, ptr addrspace(5) %v, align 4
+ ret void
+}
+
+define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) {
+; GFX12-LABEL: test_call_and_alloca_var:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15
+; GFX12-NEXT: v_mov_b32_e32 v40, 0
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_mov_b32 s32, 16
+; GFX12-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_ctz_i32_b32 s2, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s3, v0, s2
+; GFX12-NEXT: s_bitset0_b32 s1, s2
+; GFX12-NEXT: s_max_u32 s0, s0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_getpc_b64 s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT: s_mov_b32 s4, s32
+; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s32, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT: scratch_store_b32 off, v40, s4
+; GFX12-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_call_and_alloca_var:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15
+; GFX942-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX942-NEXT: v_mov_b32_e32 v40, 0
+; GFX942-NEXT: s_mov_b64 s[0:1], exec
+; GFX942-NEXT: s_mov_b32 s2, 0
+; GFX942-NEXT: s_mov_b32 s32, 16
+; GFX942-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
+; GFX942-NEXT: v_readlane_b32 s4, v0, s3
+; GFX942-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX942-NEXT: s_max_u32 s2, s2, s4
+; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT: s_mov_b32 s4, s32
+; GFX942-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s32, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT: scratch_store_dword off, v40, s4
+; GFX942-NEXT: s_endpgm
+ %v = alloca i32, i32 %count, align 4, addrspace(5)
+ call amdgpu_gfx void @foo()
+ store i32 0, ptr addrspace(5) %v, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index f6ae516..89d0394 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -1489,7 +1489,7 @@ attributes #2 = { noinline }
!0 = !{float 3.0}
;.
; CHECK: attributes #[[ATTR0]] = { strictfp }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) }
; CHECK: attributes #[[ATTR3]] = { noinline }
; CHECK: attributes #[[ATTR4]] = { nobuiltin }
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 0fc54ae..26f7789 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
-; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19
-; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18
+; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19
+; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18
; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0
-; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3
-; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2
+; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3
+; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15]
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23]
; GISEL-NEXT: v_mov_b32_e32 v22, v19
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3]
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15]
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2]
-; GISEL-NEXT: v_mov_b32_e32 v23, v14
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23]
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2]
-; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23]
-; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3]
+; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15]
+; GISEL-NEXT: v_mov_b32_e32 v2, v23
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2]
+; GISEL-NEXT: v_mov_b32_e32 v23, v25
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23]
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15]
+; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2]
+; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28
-; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5]
-; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4]
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc
+; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28
+; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28
; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4]
-; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33
-; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28
-; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33
+; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9]
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc
+; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33
; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28
-; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33
-; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7]
+; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33
+; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7]
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc
; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9]
; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc
@@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
-; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21
-; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20
+; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21
+; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20
; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0
; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0
-; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19
-; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18
+; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19
+; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18
; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23]
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27]
-; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19]
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23]
-; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18]
-; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22]
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18]
-; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22]
-; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19]
+; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v18, v26
+; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18]
+; GISEL-NEXT: v_mov_b32_e32 v22, v28
+; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22]
+; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31]
+; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18]
+; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc
+; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18]
-; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17]
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = urem <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
index 0548bcf..279f429 100644
--- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -34,3 +34,16 @@ body: |
$vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec
dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
...
+
+---
+name: test_tied
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_tied
+ ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec {
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
+ ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec
+ ; CHECK-NEXT: }
+ %1:vgpr_32 = COPY %0:vgpr_32
+ %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index e042157..460f121 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr8
@@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
-; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: .LBB0_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB0_7: ; %Flow2
@@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr8
@@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
-; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: .LBB1_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB1_7: ; %Flow2
@@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr9
@@ -999,12 +1001,14 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: .LBB2_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: .LBB2_7: ; %Flow2
@@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr9
@@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: .LBB3_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: .LBB3_7: ; %Flow2
@@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB6_4: ; %Flow
@@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB7_4: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 31b6b53..f705a2f 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX7-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX8-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX900-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX900-GISEL: ; %bb.0: ; %entry
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX90A-GISEL: ; %bb.0: ; %entry
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7]
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5]
; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX900-GISEL: ; %bb.0: ; %entry
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc
; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX90A-GISEL: ; %bb.0: ; %entry
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11]
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3]
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9]
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1]
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0
; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
index fa52b96..02eda2c 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -6,40 +6,12 @@
# No more registers shall be defined
---
name: main
-alignment: 1
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
tracksRegLiveness: true
registers:
- - { id: 1, class: sreg_32_xm0, preferred-register: '%1' }
- - { id: 2, class: vreg_64, preferred-register: '%2' }
- - { id: 3, class: vreg_64 }
- - { id: 4, class: vreg_64 }
- - { id: 5, class: vreg_64 }
- - { id: 6, class: vreg_96 }
- - { id: 7, class: vreg_96 }
- - { id: 8, class: vreg_128 }
- - { id: 9, class: vreg_128 }
-liveins:
- - { reg: '$sgpr6', virtual-reg: '%1' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
+ - { id: 0, class: sreg_32_xm0, preferred-register: '%0' }
+ - { id: 1, class: vreg_64, preferred-register: '%1' }
body: |
- bb.0.entry:
+ bb.0:
liveins: $sgpr0, $vgpr0_vgpr1
; CHECK-LABEL: name: main
@@ -59,20 +31,21 @@ body: |
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0
; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr
- %3 = IMPLICIT_DEF
- undef %4.sub0 = COPY $sgpr0
- %4.sub1 = COPY %3.sub0
- undef %5.sub0 = COPY %4.sub1
- %5.sub1 = COPY %4.sub0
- FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr
+ %2:vreg_64 = IMPLICIT_DEF
+ undef %3.sub0:vreg_64 = COPY $sgpr0
+ %3.sub1:vreg_64 = COPY %2.sub0
+ undef %4.sub0:vreg_64 = COPY %3.sub1
+ %4.sub1:vreg_64 = COPY %3.sub0
+ FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr
- %6 = IMPLICIT_DEF
- undef %7.sub0_sub1 = COPY %6
- %7.sub2 = COPY %3.sub0
- FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr
+ %5:vreg_96 = IMPLICIT_DEF
+ undef %6.sub0_sub1:vreg_96 = COPY %5
+ %6.sub2:vreg_96 = COPY %2.sub0
+ FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr
+
+ %7:vreg_128 = IMPLICIT_DEF
+ undef %8.sub0_sub1_sub2:vreg_128 = COPY %7
+ %8.sub3:vreg_128 = COPY %2.sub0
+ FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr
- %8 = IMPLICIT_DEF
- undef %9.sub0_sub1_sub2 = COPY %8
- %9.sub3 = COPY %3.sub0
- FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr
...
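Note: current MIR accepts a virtual register's class inline at its first
definition, so the verbose `registers:` and all-default `frameInfo:` preambles
are redundant. A minimal sketch of the two equivalent spellings (hypothetical
register ids, not from this test):

  # Class declared up front in the registers: block
  registers:
    - { id: 0, class: vreg_64 }
  body: |
    bb.0:
      %0 = IMPLICIT_DEF

  # Class declared inline at the first definition
  body: |
    bb.0:
      %0:vreg_64 = IMPLICIT_DEF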
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1ab4cb0..d82d6bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
; GISEL12-NEXT: s_wait_kmcnt 0x0
; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1
-; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3
-; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5
-; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7
-; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
-; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
+; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3
+; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5
+; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7
+; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9
+; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11
+; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13
+; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15
; GISEL12-NEXT: s_mov_b32 exec_lo, s9
-; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41
+; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43
+; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45
+; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47
+; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49
+; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51
+; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53
+; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55
; GISEL12-NEXT: .LBB5_2: ; %tail
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13]
-; GISEL10-NEXT: v_mov_b32_e32 v24, v0
-; GISEL10-NEXT: v_mov_b32_e32 v25, v1
-; GISEL10-NEXT: v_mov_b32_e32 v26, v2
-; GISEL10-NEXT: v_mov_b32_e32 v27, v3
-; GISEL10-NEXT: v_mov_b32_e32 v28, v4
-; GISEL10-NEXT: v_mov_b32_e32 v29, v5
-; GISEL10-NEXT: v_mov_b32_e32 v30, v6
-; GISEL10-NEXT: v_mov_b32_e32 v31, v7
-; GISEL10-NEXT: v_mov_b32_e32 v32, v8
-; GISEL10-NEXT: v_mov_b32_e32 v33, v9
-; GISEL10-NEXT: v_mov_b32_e32 v34, v10
-; GISEL10-NEXT: v_mov_b32_e32 v35, v11
-; GISEL10-NEXT: v_mov_b32_e32 v36, v12
-; GISEL10-NEXT: v_mov_b32_e32 v37, v13
-; GISEL10-NEXT: v_mov_b32_e32 v38, v14
-; GISEL10-NEXT: v_mov_b32_e32 v39, v15
+; GISEL10-NEXT: v_mov_b32_e32 v40, v0
+; GISEL10-NEXT: v_mov_b32_e32 v41, v1
+; GISEL10-NEXT: v_mov_b32_e32 v42, v2
+; GISEL10-NEXT: v_mov_b32_e32 v43, v3
+; GISEL10-NEXT: v_mov_b32_e32 v44, v4
+; GISEL10-NEXT: v_mov_b32_e32 v45, v5
+; GISEL10-NEXT: v_mov_b32_e32 v46, v6
+; GISEL10-NEXT: v_mov_b32_e32 v47, v7
+; GISEL10-NEXT: v_mov_b32_e32 v48, v8
+; GISEL10-NEXT: v_mov_b32_e32 v49, v9
+; GISEL10-NEXT: v_mov_b32_e32 v50, v10
+; GISEL10-NEXT: v_mov_b32_e32 v51, v11
+; GISEL10-NEXT: v_mov_b32_e32 v52, v12
+; GISEL10-NEXT: v_mov_b32_e32 v53, v13
+; GISEL10-NEXT: v_mov_b32_e32 v54, v14
+; GISEL10-NEXT: v_mov_b32_e32 v55, v15
; GISEL10-NEXT: s_mov_b32 exec_lo, s9
-; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL10-NEXT: v_mov_b32_e32 v24, v40
+; GISEL10-NEXT: v_mov_b32_e32 v25, v41
+; GISEL10-NEXT: v_mov_b32_e32 v26, v42
+; GISEL10-NEXT: v_mov_b32_e32 v27, v43
+; GISEL10-NEXT: v_mov_b32_e32 v28, v44
+; GISEL10-NEXT: v_mov_b32_e32 v29, v45
+; GISEL10-NEXT: v_mov_b32_e32 v30, v46
+; GISEL10-NEXT: v_mov_b32_e32 v31, v47
+; GISEL10-NEXT: v_mov_b32_e32 v32, v48
+; GISEL10-NEXT: v_mov_b32_e32 v33, v49
+; GISEL10-NEXT: v_mov_b32_e32 v34, v50
+; GISEL10-NEXT: v_mov_b32_e32 v35, v51
+; GISEL10-NEXT: v_mov_b32_e32 v36, v52
+; GISEL10-NEXT: v_mov_b32_e32 v37, v53
+; GISEL10-NEXT: v_mov_b32_e32 v38, v54
+; GISEL10-NEXT: v_mov_b32_e32 v39, v55
; GISEL10-NEXT: .LBB5_2: ; %tail
; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GISEL10-NEXT: v_mov_b32_e32 v8, v24
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
index 4fa7c29..7100522 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
@@ -481,3 +481,15 @@ define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) {
%lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, %.preheader15 ]
br label %.loopexit
}
+
+;; iree-org/iree#22551 - crash on an input that reduces to the non-canonical select below.
+define ptr addrspace(7) @noncanonical_const_cond(ptr addrspace(7) %x) {
+; CHECK-LABEL: define { ptr addrspace(8), i32 } @noncanonical_const_cond
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0
+; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1
+; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]]
+;
+ %ret = select i1 false, ptr addrspace(7) %x, ptr addrspace(7) %x
+ ret ptr addrspace(7) %ret
+}
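Note: a select with a constant condition, like the one above, would normally be
folded away before this pass runs, which is why only the raw form triggered the
crash. A minimal sketch of the canonicalized equivalent (hypothetical function
name, not part of the patch):

  define ptr addrspace(7) @canonical_equivalent(ptr addrspace(7) %x) {
    ; select i1 false, %x, %x simplifies to plain %x
    ret ptr addrspace(7) %x
  }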
diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index aab035f..b9bf138 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -1,13 +1,49 @@
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
-; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
-
-; CHECK: {{^}}test:
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
-; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -check-prefix=R600
+; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s -check-prefix=CM
define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+; R600-LABEL: test:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1,
+; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1,
+; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1,
+; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1,
+; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z,
+; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z,
+; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z,
+; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z,
+; R600-NEXT: OR_INT T0.W, PV.W, PV.Z,
+; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X,
+; R600-NEXT: OR_INT T0.X, PS, PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: test:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1,
+; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1,
+; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z,
+; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z,
+; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z,
+; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z,
+; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X,
+; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%shl = sub i32 32, %e
%x = add i32 %x_arg, 1
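Note: each lane of this kernel builds a rotate out of opposing shifts, which
the r600 backend matches to BIT_ALIGN_INT (a funnel shift), as the clauses
above show. A minimal scalar sketch of the pattern (hypothetical names,
assuming a left rotate for illustration):

  define i32 @rotate_sketch(i32 %v, i32 %e) {
    %inv = sub i32 32, %e   ; complementary shift amount
    %hi  = shl i32 %v, %e
    %lo  = lshr i32 %v, %inv
    %rot = or i32 %hi, %lo  ; (%v << %e) | (%v >> (32 - %e))
    ret i32 %rot
  }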
diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll
new file mode 100644
index 0000000..8eefc9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/private-function.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
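+; Check that a private function is still emitted when its only use is a global initializer.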
+define private void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+@var = global ptr @foo
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
deleted file mode 100644
index 05a0e39..0000000
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
+++ /dev/null
@@ -1,325 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
-
-define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 {
-; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users(
-; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; OPT-NEXT: [[ENTRY:.*:]]
-; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison
-; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0
-; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1
-; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2
-; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3
-; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4
-; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5
-; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6
-; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7
-; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8
-; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9
-; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10
-; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11
-; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12
-; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13
-; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14
-; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15
-; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0
-; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1
-; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2
-; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3
-; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4
-; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5
-; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6
-; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7
-; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8
-; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9
-; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10
-; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11
-; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12
-; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13
-; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14
-; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15
-; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0
-; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1
-; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2
-; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3
-; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4
-; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5
-; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6
-; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7
-; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8
-; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9
-; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10
-; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11
-; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12
-; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13
-; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14
-; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15
-; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0
-; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1
-; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2
-; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3
-; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4
-; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5
-; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6
-; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7
-; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8
-; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9
-; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10
-; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11
-; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12
-; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13
-; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14
-; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15
-; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0
-; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1
-; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2
-; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3
-; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4
-; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5
-; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6
-; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7
-; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8
-; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9
-; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10
-; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11
-; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12
-; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13
-; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14
-; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15
-; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0
-; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1
-; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2
-; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3
-; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4
-; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5
-; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6
-; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7
-; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8
-; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9
-; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10
-; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11
-; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12
-; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13
-; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14
-; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15
-; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0
-; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1
-; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2
-; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3
-; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4
-; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5
-; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6
-; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7
-; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8
-; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9
-; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10
-; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11
-; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12
-; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13
-; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14
-; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15
-; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0
-; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0
-; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1
-; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1
-; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2
-; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2
-; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3
-; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3
-; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4
-; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4
-; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5
-; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5
-; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6
-; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6
-; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7
-; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7
-; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8
-; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8
-; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9
-; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9
-; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10
-; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10
-; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11
-; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11
-; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12
-; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12
-; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13
-; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13
-; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14
-; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14
-; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15
-; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15
-; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80
-; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0
-; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81
-; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1
-; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82
-; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2
-; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83
-; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3
-; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84
-; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4
-; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85
-; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5
-; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86
-; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6
-; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87
-; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7
-; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88
-; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8
-; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89
-; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9
-; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90
-; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10
-; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91
-; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11
-; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92
-; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12
-; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93
-; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13
-; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94
-; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14
-; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95
-; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15
-; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]]
-; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16
-; OPT-NEXT: ret void
-;
-entry:
- %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5)
- %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7
- store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
- %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16
- %sum = add <16 x i8> %load, %add
- store <16 x i8> %sum, ptr addrspace(3) %out, align 16
- ret void
-}
-
-attributes #0 = {"amdgpu-waves-per-eu"="2,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
new file mode 100644
index 0000000..381cb8c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s
+
+# This test checks the fix for the "Bad machine code: Defining instruction does not modify register" verifier failure caused by a corrupt lane mask.
+
+---
+name: reg_coalescer_subreg_liveness
+tracksRegLiveness: true
+liveins:
+body: |
+ ; CHECK-LABEL: name: reg_coalescer_subreg_liveness
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: $vcc_lo = COPY $exec_lo
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64 = COPY killed $sgpr4_sgpr5
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ %2:sreg_32 = S_MOV_B32 1
+ undef %3.sub0:sgpr_128 = COPY %2
+ %4:sreg_32 = S_MOV_B32 0
+ undef %5.sub0:sgpr_256 = COPY %4
+ TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ %6:sgpr_128 = COPY killed %3
+ %6.sub1:sgpr_128 = COPY killed %1
+ %7:sreg_32 = COPY $exec_lo
+ %8:sreg_32 = COPY %2
+ %9:sreg_32 = COPY %4
+
+ bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ %10:sreg_32 = COPY killed %8
+ undef %11.sub0:sgpr_128 = COPY %2
+ %11.sub1:sgpr_128 = COPY killed %10
+ %11.sub2:sgpr_128 = COPY %2
+ %11.sub3:sgpr_128 = COPY %2
+ TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ %12:sreg_32 = COPY killed %9
+ %13:sgpr_128 = COPY %6
+ %13.sub2:sgpr_128 = COPY killed %12
+ TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ $vcc_lo = COPY %7
+ %8:sreg_32 = COPY %4
+ %9:sreg_32 = COPY %2
+ S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+---
+name: reg_coalescer_subreg_liveness_2
+tracksRegLiveness: true
+liveins:
+body: |
+ ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]]
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64 = COPY killed $sgpr4_sgpr5
+ %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+ %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
+ %3:sreg_32 = S_MOV_B32 1
+ undef %4.sub0:sgpr_128 = COPY %3
+ %5:sgpr_128 = COPY %4
+ %5.sub1:sgpr_128 = COPY killed %2
+ %6:sgpr_128 = COPY %5
+ %6.sub2:sgpr_128 = COPY killed %1
+ %7:sreg_32 = S_MOV_B32 0
+ undef %8.sub0:sgpr_256 = COPY %7
+ %9:sreg_32 = COPY %3
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %10:sreg_32 = COPY killed %9
+ undef %11.sub0:sgpr_128 = COPY %3
+ %11.sub1:sgpr_128 = COPY killed %10
+ S_NOP 0, implicit %5, implicit %8
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
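Note: the failure both tests target is a subregister live range whose lane mask
stops matching any defining instruction after coalescing. A minimal sketch of
the shape involved (hypothetical registers, not from the tests):

  # Only the sub0 lane of %1 is defined here; the other lanes stay undef.
  undef %1.sub0:sgpr_128 = S_MOV_B32 1
  # Coalescing this COPY folds %0 into %1.sub1, so %1's live interval must
  # carry a subrange whose lane mask covers exactly the sub1 lane.
  %1.sub1:sgpr_128 = COPY %0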
diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
index 002d43f..1316569 100644
--- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
+++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
---
@@ -41,6 +42,27 @@ body: |
...
---
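+# The peephole should merge the two S_SET_GPR_IDX_ON/OFF regions even when
+# only meta instructions (KILL, IMPLICIT_DEF) separate them.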
+name: meta_in_between
+body: |
+ bb.0:
+ ; GCN-LABEL: name: meta_in_between
+ ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: KILL $sgpr0
+ ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ KILL $sgpr0
+ $sgpr0 = IMPLICIT_DEF
+ S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+...
+
+---
name: valu_write_in_between
body: |
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index 4d5ade4..1b4ed67 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9]
; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2]
-; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2]
+; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64:
@@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2]
-; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2]
+; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64:
@@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4
+; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6
+; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8
+; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14
+; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18
+; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0
; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5]
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9]
; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64:
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index a1381ec..f964480 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -1069,6 +1069,51 @@ body: |
$sgpr0 = S_MOV_B32 $sgpr0
...
+# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
+---
+name: mixed_pending_events
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: mixed_pending_events
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_LOADCNT 1
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr2, $vgpr2
+ $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+...
+
---
name: pending_vmem_event_between_block
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
new file mode 100644
index 0000000..df3e780
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
@@ -0,0 +1,42 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s
+
+---
+name: wmma_test
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %0:vreg_128 = IMPLICIT_DEF
+ %1:vreg_128 = IMPLICIT_DEF
+ %2:sreg_32 = IMPLICIT_DEF
+ early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec
+ %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ %5:vreg_256 = COPY %3.sub1:vreg_256
+
+ bb.2:
+ SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
index cabd43e..9e243ae 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
}
; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
entry:
%sink = alloca i32, align 4
@@ -33,7 +32,7 @@ entry:
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
; CHECK-NEXT: .byte 1
;; Function Entry PC
-; CHECK-NEXT: .long [[LABEL_FUNC]]
+; CHECK-NEXT: .long _ZL10myCallbacki
;; Function type ID -5212364466660467813
; CHECK-NEXT: .long 1154849691
; CHECK-NEXT: .long 3081369122
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
index 3d3974e..8e8881e 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
declare !type !2 ptr @direct_baz(ptr)
; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define ptr @ball() {
entry:
call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
;; Flags
; CHECK-NEXT: .byte 7
;; Function Entry PC
-; CHECK-NEXT: .long [[LABEL_FUNC]]
+; CHECK-NEXT: .long ball
;; Function type ID -- set to 0 as no type metadata attached to function.
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 0
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
index 8036004..35e570b 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
@@ -29,6 +29,6 @@ declare !type !2 i32 @bar(i8 signext)
; CHECK: Hex dump of section '.llvm.callgraph':
; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154
-; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8
+; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05000000 00a150b8
;; Verify that the type id 0x308e4b8159bc8654 is in section.
; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30
diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll
index d5a1d63..b7d5186 100644
--- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll
+++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll
@@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \
; CHECK: .cfi_endproc
; CHECK: .section .jumptables,"",@progbits
; CHECK: BPF.JT.0.0:
-; CHECK: .quad LBB0_3
+; CHECK: .quad LBB0_3-.text
; CHECK: .size BPF.JT.0.0, 8
; CHECK: BPF.JT.0.1:
-; CHECK: .quad LBB0_4
+; CHECK: .quad LBB0_4-.text
; CHECK: .size BPF.JT.0.1, 8
diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll
index bbca468..71c682f 100644
--- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll
+++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll
@@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \
; CHECK: .cfi_endproc
; CHECK: .section .jumptables,"",@progbits
; CHECK: BPF.JT.0.0:
-; CHECK: .quad LBB0_1
-; CHECK: .quad LBB0_2
+; CHECK: .quad LBB0_1-.text
+; CHECK: .quad LBB0_2-.text
; CHECK: .size BPF.JT.0.0, 16
diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll
index 682b025..eb1e5bf 100644
--- a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll
+++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll
@@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \
; CHECK: .cfi_endproc
; CHECK: .section .jumptables,"",@progbits
; CHECK: BPF.JT.0.0:
-; CHECK: .quad LBB0_4
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_2
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_5
-; CHECK: .quad LBB0_3
+; CHECK: .quad LBB0_4-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_2-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_3-.text
; CHECK: .size BPF.JT.0.0, 240
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000..edc5c19
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+ %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+ ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+ ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+ ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+ ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+ ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+ ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+ ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+ %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+ ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+ ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+ ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+ ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+ ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+ ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+ ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+ ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+ ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+ ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+ %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+ ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+ ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+ ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+ ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+ ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+ ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+ ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+ ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+ ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+ ; CHECK: [[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0
+ ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1
+ ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2
+ ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3
+  ; CHECK: ret <4 x float> [[FLOAT4_3]]
+ %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0)
+ ret <4 x float> %hlsl.f16tof32
+}
diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
new file mode 100644
index 0000000..9016c5d
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
@@ -0,0 +1,31 @@
+; RUN: split-file %s %t
+; RUN: not opt -S --dxil-translate-metadata %t/low-sm.ll 2>&1 | FileCheck %t/low-sm.ll
+; RUN: not opt -S --dxil-translate-metadata %t/low-sm-for-range.ll 2>&1 | FileCheck %t/low-sm-for-range.ll
+
+; Test that wavesize metadata is only allowed on applicable shader model versions
+
+;--- low-sm.ll
+
+; CHECK: Shader model 6.6 or greater is required to specify the "hlsl.wavesize" function attribute
+
+target triple = "dxil-unknown-shadermodel6.5-compute"
+
+define void @main() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- low-sm-for-range.ll
+
+; CHECK: Shader model 6.8 or greater is required to specify wave size range values of the "hlsl.wavesize" function attribute
+
+target triple = "dxil-unknown-shadermodel6.7-compute"
+
+define void @main() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll
new file mode 100644
index 0000000..3ad6c1d0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll
@@ -0,0 +1,96 @@
+; RUN: split-file %s %t
+; RUN: opt -S --dxil-translate-metadata %t/only.ll | FileCheck %t/only.ll
+; RUN: opt -S --dxil-translate-metadata %t/min.ll | FileCheck %t/min.ll
+; RUN: opt -S --dxil-translate-metadata %t/max.ll | FileCheck %t/max.ll
+; RUN: opt -S --dxil-translate-metadata %t/pref.ll | FileCheck %t/pref.ll
+
+; RUN: llc --filetype=obj %t/only.ll -o - | obj2yaml | FileCheck %t/only.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/min.ll -o - | obj2yaml | FileCheck %t/min.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/max.ll -o - | obj2yaml | FileCheck %t/max.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/pref.ll -o - | obj2yaml | FileCheck %t/pref.ll --check-prefix=OBJ
+
+; Test that wave size/range metadata is generated with the correct tag
+
+;--- only.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 11, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16}
+
+; OBJ: - Name: PSV0
+; OBJ: PSVInfo:
+; OBJ: MinimumWaveLaneCount: 16
+; OBJ: MaximumWaveLaneCount: 16
+
+target triple = "dxil-unknown-shadermodel6.6-compute"
+
+define void @main() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- min.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 0, i32 0}
+
+; OBJ: - Name: PSV0
+; OBJ: PSVInfo:
+; OBJ: MinimumWaveLaneCount: 16
+; OBJ: MaximumWaveLaneCount: 16
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- max.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 32, i32 0}
+
+; OBJ: - Name: PSV0
+; OBJ: PSVInfo:
+; OBJ: MinimumWaveLaneCount: 16
+; OBJ: MaximumWaveLaneCount: 32
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- pref.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 64, i32 32}
+
+; OBJ: - Name: PSV0
+; OBJ: PSVInfo:
+; OBJ: MinimumWaveLaneCount: 16
+; OBJ: MaximumWaveLaneCount: 64
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,64,32" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll
new file mode 100644
index 0000000..67d2ad7
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll
@@ -0,0 +1,50 @@
+; REQUIRES: x86-registered-target
+
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s
+
+;; Check that functions with the optnone attribute are not split.
+; CHECK-LABEL: foo_optnone:
+; CHECK-NOT: .section .text.split.foo_optnone
+; CHECK-NOT: foo_optnone.cold:
+; CHECK: .LBB0_2:
+; CHECK: .size foo_optnone
+
+define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 {
+entry:
+ br i1 %0, label %hot, label %cold, !prof !17
+
+hot:
+ %1 = call i32 @bar()
+ br label %exit
+
+cold:
+ %2 = call i32 @baz()
+ br label %exit
+
+exit:
+ %3 = tail call i32 @qux()
+ ret void
+}
+
+declare i32 @bar()
+declare i32 @baz()
+declare i32 @qux()
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 5}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999900, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 7000}
+!15 = !{!"function_section_prefix", !"hot"}
+!17 = !{!"branch_weights", i32 7000, i32 0}
diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll
index 559bb68..930cf81 100644
--- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll
+++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll
@@ -6,11 +6,11 @@
target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"
-define i32 @fred(ptr %a0) #0 {
+define i32 @fred(ptr %a0, i32 %cond) #0 {
; CHECK-LABEL: fred:
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
-; CHECK-NEXT: if (p0) jump:nt .LBB0_2
+; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2
; CHECK-NEXT: }
; CHECK-NEXT: // %bb.1: // %b2
; CHECK-NEXT: {
@@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
b0:
- switch i32 undef, label %b14 [
+ switch i32 %cond, label %b14 [
i32 5, label %b2
i32 3, label %b1
]
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll
new file mode 100644
index 0000000..006713c
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll
@@ -0,0 +1,303 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>)
+
+define void @lasx_cast_128_s(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_cast_128_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x float>, ptr %va
+ %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a)
+ store <8 x float> %b, ptr %vd
+ ret void
+}
+
+declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>)
+
+define void @lasx_cast_128_d(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_cast_128_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <2 x double>, ptr %va
+ %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a)
+ store <4 x double> %b, ptr %vd
+ ret void
+}
+
+declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>)
+
+define void @lasx_cast_128(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_cast_128:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <2 x i64>, ptr %va
+ %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a)
+ store <4 x i64> %b, ptr %vd
+ ret void
+}
+
+declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>)
+
+define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_concat_128_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x float>, ptr %va
+ %b = load <4 x float>, ptr %vb
+ %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b)
+ store <8 x float> %c, ptr %vd
+ ret void
+}
+
+declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>)
+
+define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_concat_128_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <2 x double>, ptr %va
+ %b = load <2 x double>, ptr %vb
+ %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b)
+ store <4 x double> %c, ptr %vd
+ ret void
+}
+
+declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>)
+
+define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_concat_128:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <2 x i64>, ptr %va
+ %b = load <2 x i64>, ptr %vb
+ %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b)
+ store <4 x i64> %c, ptr %vd
+ ret void
+}
+
+declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>)
+
+define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_extract_128_lo_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x float>, ptr %va
+ %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a)
+ store <4 x float> %c, ptr %vd
+ ret void
+}
+
+declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>)
+
+define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_extract_128_lo_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x double>, ptr %va
+ %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a)
+ store <2 x double> %c, ptr %vd
+ ret void
+}
+
+declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>)
+
+define void @lasx_extract_128_lo(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_extract_128_lo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i64>, ptr %va
+ %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a)
+ store <2 x i64> %c, ptr %vd
+ ret void
+}
+
+declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>)
+
+define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_extract_128_hi_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x float>, ptr %va
+ %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a)
+ store <4 x float> %c, ptr %vd
+ ret void
+}
+
+declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>)
+
+define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_extract_128_hi_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x double>, ptr %va
+ %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a)
+ store <2 x double> %c, ptr %vd
+ ret void
+}
+
+declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>)
+
+define void @lasx_extract_128_hi(ptr %vd, ptr %va) {
+; CHECK-LABEL: lasx_extract_128_hi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i64>, ptr %va
+ %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a)
+ store <2 x i64> %c, ptr %vd
+ ret void
+}
+
+declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>)
+
+define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_insert_128_lo_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x float>, ptr %va
+ %b = load <4 x float>, ptr %vb
+ %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b)
+ store <8 x float> %c, ptr %vd
+ ret void
+}
+
+declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>)
+
+define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_insert_128_lo_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x double>, ptr %va
+ %b = load <2 x double>, ptr %vb
+ %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b)
+ store <4 x double> %c, ptr %vd
+ ret void
+}
+
+declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>)
+
+define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_insert_128_lo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i64>, ptr %va
+ %b = load <2 x i64>, ptr %vb
+ %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b)
+ store <4 x i64> %c, ptr %vd
+ ret void
+}
+
+declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>)
+
+define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_insert_128_hi_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x float>, ptr %va
+ %b = load <4 x float>, ptr %vb
+ %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b)
+ store <8 x float> %c, ptr %vd
+ ret void
+}
+
+declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>)
+
+define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_insert_128_hi_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x double>, ptr %va
+ %b = load <2 x double>, ptr %vb
+ %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b)
+ store <4 x double> %c, ptr %vd
+ ret void
+}
+
+declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>)
+
+define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) {
+; CHECK-LABEL: lasx_insert_128_hi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i64>, ptr %va
+ %b = load <2 x i64>, ptr %vb
+ %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b)
+ store <4 x i64> %c, ptr %vd
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 4d930cd..3626613 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %}
@@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsub(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsub(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_faddx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_faddx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsubx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsubx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fmulx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
+; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fmulx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fdiv(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<5>;
+; SM90-FTZ-NEXT: .reg .b32 %r<8>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
+; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fdiv(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<5>;
@@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fpext_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fpext_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptrunc_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptrunc_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd_imm_1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
+; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80;
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd_imm_1(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1;
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_extload_bf16x8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<9>;
+; SM90-FTZ-NEXT: .reg .b32 %r<13>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
+; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_extload_bf16x8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<9>;
@@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptosi_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptosi_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptoui_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptoui_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_sitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_sitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .pred %p<2>;
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
+; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
+; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1;
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i1(
; SM90: {
; SM90-NEXT: .reg .pred %p<2>;
@@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i32(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i32(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i64(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i64(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_roundeven(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
+; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_roundeven(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maximum(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
+; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maximum(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maxnum(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
+; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maxnum(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maximum_v2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
+; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maximum_v2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_maxnum_v2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1];
+; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_maxnum_v2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
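
Note the asymmetry the new SM90-FTZ prefix exposes: on sm_90 the scalar and packed bf16 add/sub/mul/max cases select native bf16 instructions regardless of the f32 denormal mode, while fdiv is still legalized through f32, which is the only place the .ftz conversions survive. The source side is just the plain IR operation, e.g. a condensed form of test_fdiv above:

define <2 x bfloat> @div_sketch(<2 x bfloat> %a, <2 x bfloat> %b) {
  ; lowered as cvt to f32, div.rn.ftz.f32, cvt back (see the SM90-FTZ checks)
  %r = fdiv <2 x bfloat> %a, %b
  ret <2 x bfloat> %r
}
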
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
index d3853e2..4d81fdc 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s
+; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s
; Test that we get a clear error message when using an unsupported syncscope.
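
The added -o /dev/null keeps llc's normal output out of the pipe, so only the stderr diagnostic merged by 2>&1 reaches FileCheck. The guarded input is a cmpxchg whose syncscope the NVPTX backend does not define; a minimal sketch, with the scope name chosen for illustration rather than taken from the test:

define i32 @bad_scope(ptr %p, i32 %old, i32 %new) {
  ; "agent" stands in for any scope string NVPTX rejects with an error
  %pair = cmpxchg ptr %p, i32 %old, i32 %new syncscope("agent") acquire acquire
  %val = extractvalue { i32, i1 } %pair, 0
  ret i32 %val
}
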
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
index 7e2f744..94121f0 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
@@ -5,6 +5,12 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC

define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 {
; CHECK-LABEL: testMultiply:
@@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound
; CHECK-BE-NEXT: ld r30, -16(r1)
; CHECK-BE-NEXT: mtlr r0
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: testMultiply:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: mflr r0
+; CHECK-LE-WACC-NEXT: std r30, -16(r1)
+; CHECK-LE-WACC-NEXT: std r0, 16(r1)
+; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59
+; CHECK-LE-WACC-NEXT: subfic r0, r0, -128
+; CHECK-LE-WACC-NEXT: mr r30, r1
+; CHECK-LE-WACC-NEXT: stdux r1, r1, r0
+; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill
+; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill
+; CHECK-LE-WACC-NEXT: lxv v31, 0(r3)
+; CHECK-LE-WACC-NEXT: lxv v30, 0(r4)
+; CHECK-LE-WACC-NEXT: addi r3, r1, 32
+; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill
+; CHECK-LE-WACC-NEXT: vmr v2, v31
+; CHECK-LE-WACC-NEXT: vmr v3, v30
+; CHECK-LE-WACC-NEXT: mr r29, r5
+; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc
+; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30
+; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1)
+; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1)
+; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v5, 0(r29)
+; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0
+; CHECK-LE-WACC-NEXT: stxv v3, 16(r29)
+; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0
+; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload
+; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload
+; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload
+; CHECK-LE-WACC-NEXT: mr r1, r30
+; CHECK-LE-WACC-NEXT: ld r0, 16(r1)
+; CHECK-LE-WACC-NEXT: ld r30, -16(r1)
+; CHECK-LE-WACC-NEXT: mtlr r0
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testMultiply:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: mflr r0
+; CHECK-BE-WACC-NEXT: std r30, -16(r1)
+; CHECK-BE-WACC-NEXT: std r0, 16(r1)
+; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59
+; CHECK-BE-WACC-NEXT: subfic r0, r0, -224
+; CHECK-BE-WACC-NEXT: mr r30, r1
+; CHECK-BE-WACC-NEXT: stdux r1, r1, r0
+; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill
+; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill
+; CHECK-BE-WACC-NEXT: lxv v31, 0(r3)
+; CHECK-BE-WACC-NEXT: lxv v30, 0(r4)
+; CHECK-BE-WACC-NEXT: addi r3, r1, 128
+; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill
+; CHECK-BE-WACC-NEXT: vmr v2, v31
+; CHECK-BE-WACC-NEXT: vmr v3, v30
+; CHECK-BE-WACC-NEXT: mr r29, r5
+; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_
+; CHECK-BE-WACC-NEXT: nop
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30
+; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1)
+; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1)
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: vmr v1, v2
+; CHECK-BE-WACC-NEXT: vmr v7, v4
+; CHECK-BE-WACC-NEXT: vmr v0, v3
+; CHECK-BE-WACC-NEXT: vmr v6, v5
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r29)
+; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0
+; CHECK-BE-WACC-NEXT: stxv v4, 16(r29)
+; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0
+; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload
+; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload
+; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload
+; CHECK-BE-WACC-NEXT: mr r1, r30
+; CHECK-BE-WACC-NEXT: ld r0, 16(r1)
+; CHECK-BE-WACC-NEXT: ld r30, -16(r1)
+; CHECK-BE-WACC-NEXT: mtlr r0
+; CHECK-BE-WACC-NEXT: blr
entry:
%vP = alloca <256 x i1>, align 32
call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP)
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
index 059d60a..bc5d5be 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -3,10 +3,18 @@
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \
; RUN: --check-prefix=LE-PAIRED
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \
+; RUN: --check-prefix=LE-PAIRED-WACC
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \
; RUN: FileCheck %s --check-prefix=BE-PAIRED
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \
+; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \
; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN: | FileCheck %s --check-prefix=LE-PWR9
@@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) {
; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1
; LE-PAIRED-NEXT: blr
;
+; LE-PAIRED-WACC-LABEL: testLdSt:
+; LE-PAIRED-WACC: # %bb.0: # %entry
+; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1
+; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1
+; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1
+; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1
+; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1
+; LE-PAIRED-WACC-NEXT: blr
+;
; BE-PAIRED-LABEL: testLdSt:
; BE-PAIRED: # %bb.0: # %entry
; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha
@@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) {
; BE-PAIRED-NEXT: stxv vs2, 160(r3)
; BE-PAIRED-NEXT: blr
;
+; BE-PAIRED-WACC-LABEL: testLdSt:
+; BE-PAIRED-WACC: # %bb.0: # %entry
+; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha
+; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l
+; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3)
+; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3)
+; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3)
+; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3)
+; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3)
+; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3)
+; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3)
+; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3)
+; BE-PAIRED-WACC-NEXT: blr
+;
; LE-PWR9-LABEL: testLdSt:
; LE-PWR9: # %bb.0: # %entry
; LE-PWR9-NEXT: addis r3, r2, f@toc@ha
@@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) {
; LE-PAIRED-NEXT: stxv vs2, 16(r4)
; LE-PAIRED-NEXT: blr
;
+; LE-PAIRED-WACC-LABEL: testXLdSt:
+; LE-PAIRED-WACC: # %bb.0: # %entry
+; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1
+; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6
+; LE-PAIRED-WACC-NEXT: add r6, r5, r3
+; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3
+; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6)
+; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6)
+; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6)
+; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6
+; LE-PAIRED-WACC-NEXT: add r4, r5, r3
+; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3
+; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4)
+; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4)
+; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4)
+; LE-PAIRED-WACC-NEXT: blr
+;
; BE-PAIRED-LABEL: testXLdSt:
; BE-PAIRED: # %bb.0: # %entry
; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha
@@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) {
; BE-PAIRED-NEXT: stxv vs2, 32(r4)
; BE-PAIRED-NEXT: blr
;
+; BE-PAIRED-WACC-LABEL: testXLdSt:
+; BE-PAIRED-WACC: # %bb.0: # %entry
+; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha
+; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l
+; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6
+; BE-PAIRED-WACC-NEXT: add r6, r5, r3
+; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3
+; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6)
+; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6)
+; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6)
+; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6
+; BE-PAIRED-WACC-NEXT: add r4, r5, r3
+; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3
+; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4)
+; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4)
+; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4)
+; BE-PAIRED-WACC-NEXT: blr
+;
; LE-PWR9-LABEL: testXLdSt:
; LE-PWR9: # %bb.0: # %entry
; LE-PWR9-NEXT: addis r5, r2, f@toc@ha
@@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() {
; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1
; LE-PAIRED-NEXT: blr
;
+; LE-PAIRED-WACC-LABEL: testUnalignedLdSt:
+; LE-PAIRED-WACC: # %bb.0: # %entry
+; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1
+; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1
+; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1
+; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1
+; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1
+; LE-PAIRED-WACC-NEXT: blr
+;
; BE-PAIRED-LABEL: testUnalignedLdSt:
; BE-PAIRED: # %bb.0: # %entry
; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha
@@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() {
; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0
; BE-PAIRED-NEXT: blr
;
+; BE-PAIRED-WACC-LABEL: testUnalignedLdSt:
+; BE-PAIRED-WACC: # %bb.0: # %entry
+; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha
+; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l
+; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0
+; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0
+; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0
+; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0
+; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
+; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0
+; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0
+; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0
+; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0
+; BE-PAIRED-WACC-NEXT: blr
+;
; LE-PWR9-LABEL: testUnalignedLdSt:
; LE-PWR9: # %bb.0: # %entry
; LE-PWR9-NEXT: addis r3, r2, f@toc@ha
@@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1
; LE-PAIRED-NEXT: blr
;
+; LE-PAIRED-WACC-LABEL: testLdStPair:
+; LE-PAIRED-WACC: # %bb.0: # %entry
+; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1
+; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1
+; LE-PAIRED-WACC-NEXT: blr
+;
; BE-PAIRED-LABEL: testLdStPair:
; BE-PAIRED: # %bb.0: # %entry
; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha
@@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; BE-PAIRED-NEXT: stxv vs0, 64(r3)
; BE-PAIRED-NEXT: blr
;
+; BE-PAIRED-WACC-LABEL: testLdStPair:
+; BE-PAIRED-WACC: # %bb.0: # %entry
+; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha
+; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l
+; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3)
+; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3)
+; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3)
+; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3)
+; BE-PAIRED-WACC-NEXT: blr
+;
; LE-PWR9-LABEL: testLdStPair:
; LE-PWR9: # %bb.0: # %entry
; LE-PWR9-NEXT: addis r3, r2, g@toc@ha
@@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; LE-PAIRED-NEXT: stxv vs1, 16(r4)
; LE-PAIRED-NEXT: blr
;
+; LE-PAIRED-WACC-LABEL: testXLdStPair:
+; LE-PAIRED-WACC: # %bb.0: # %entry
+; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5
+; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1
+; LE-PAIRED-WACC-NEXT: add r6, r5, r3
+; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3
+; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6)
+; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5
+; LE-PAIRED-WACC-NEXT: add r4, r5, r3
+; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3
+; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4)
+; LE-PAIRED-WACC-NEXT: blr
+;
; BE-PAIRED-LABEL: testXLdStPair:
; BE-PAIRED: # %bb.0: # %entry
; BE-PAIRED-NEXT: addis r5, r2, g@toc@ha
@@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; BE-PAIRED-NEXT: stxv vs1, 16(r4)
; BE-PAIRED-NEXT: blr
;
+; BE-PAIRED-WACC-LABEL: testXLdStPair:
+; BE-PAIRED-WACC: # %bb.0: # %entry
+; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha
+; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5
+; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l
+; BE-PAIRED-WACC-NEXT: add r6, r5, r3
+; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3
+; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6)
+; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5
+; BE-PAIRED-WACC-NEXT: add r4, r5, r3
+; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3
+; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4)
+; BE-PAIRED-WACC-NEXT: blr
+;
; LE-PWR9-LABEL: testXLdStPair:
; LE-PWR9: # %bb.0: # %entry
; LE-PWR9-NEXT: addis r5, r2, g@toc@ha
@@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() {
; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1
; LE-PAIRED-NEXT: blr
;
+; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair:
+; LE-PAIRED-WACC: # %bb.0: # %entry
+; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1
+; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1
+; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1
+; LE-PAIRED-WACC-NEXT: blr
+;
; BE-PAIRED-LABEL: testUnalignedLdStPair:
; BE-PAIRED: # %bb.0: # %entry
; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha
@@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() {
; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0
; BE-PAIRED-NEXT: blr
;
+; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair:
+; BE-PAIRED-WACC: # %bb.0: # %entry
+; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha
+; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l
+; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0
+; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0
+; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0
+; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0
+; BE-PAIRED-WACC-NEXT: blr
+;
; LE-PWR9-LABEL: testUnalignedLdStPair:
; LE-PWR9: # %bb.0: # %entry
; LE-PWR9-NEXT: addis r3, r2, g@toc@ha
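
Contrast with the accumulator tests above: the *Pair cases traffic in 256-bit vector pairs, so even at -mcpu=future they remain plain two-register loads and stores with no dmxxinstdmr512/dmxxextfdmr512 round trip. A condensed sketch of such a pair op, assuming the usual <256 x i1> representation these tests use:

define void @pair_copy_sketch(ptr %src, ptr %dst) {
  ; a vector pair moves as two 16-byte VSX accesses; no wacc involved
  %p = load <256 x i1>, ptr %src, align 32
  store <256 x i1> %p, ptr %dst, align 32
  ret void
}
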
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
index abc65be..9db8ba1 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
@@ -13,6 +13,13 @@
; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC
+

declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)
declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
declare void @foo()
@@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
; CHECK-BE-NEXT: ld r0, 16(r1)
; CHECK-BE-NEXT: mtlr r0
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: intrinsics1:
+; CHECK-LE-WACC: # %bb.0:
+; CHECK-LE-WACC-NEXT: mflr r0
+; CHECK-LE-WACC-NEXT: std r0, 16(r1)
+; CHECK-LE-WACC-NEXT: stdu r1, -176(r1)
+; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176
+; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16
+; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16
+; CHECK-LE-WACC-NEXT: .cfi_offset v28, -80
+; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64
+; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48
+; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32
+; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill
+; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill
+; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill
+; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill
+; CHECK-LE-WACC-NEXT: vmr v31, v5
+; CHECK-LE-WACC-NEXT: vmr v29, v3
+; CHECK-LE-WACC-NEXT: vmr v30, v4
+; CHECK-LE-WACC-NEXT: vmr v28, v2
+; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill
+; CHECK-LE-WACC-NEXT: ld r30, 272(r1)
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0
+; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1)
+; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1)
+; CHECK-LE-WACC-NEXT: bl foo@notoc
+; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1)
+; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1)
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r30)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r30)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r30)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r30)
+; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload
+; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload
+; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload
+; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 16-byte Folded Reload
+; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload
+; CHECK-LE-WACC-NEXT: addi r1, r1, 176
+; CHECK-LE-WACC-NEXT: ld r0, 16(r1)
+; CHECK-LE-WACC-NEXT: mtlr r0
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: intrinsics1:
+; CHECK-BE-WACC: # %bb.0:
+; CHECK-BE-WACC-NEXT: mflr r0
+; CHECK-BE-WACC-NEXT: std r0, 16(r1)
+; CHECK-BE-WACC-NEXT: stdu r1, -256(r1)
+; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256
+; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16
+; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16
+; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80
+; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64
+; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48
+; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32
+; CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill
+; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill
+; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill
+; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill
+; CHECK-BE-WACC-NEXT: vmr v31, v5
+; CHECK-BE-WACC-NEXT: vmr v29, v3
+; CHECK-BE-WACC-NEXT: vmr v30, v4
+; CHECK-BE-WACC-NEXT: vmr v28, v2
+; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill
+; CHECK-BE-WACC-NEXT: ld r30, 368(r1)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0
+; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1)
+; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1)
+; CHECK-BE-WACC-NEXT: bl foo
+; CHECK-BE-WACC-NEXT: nop
+; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1)
+; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r30)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r30)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r30)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r30)
+; CHECK-BE-WACC-NEXT: lxv v31, 224(r1) # 16-byte Folded Reload
+; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload
+; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload
+; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload
+; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload
+; CHECK-BE-WACC-NEXT: addi r1, r1, 256
+; CHECK-BE-WACC-NEXT: ld r0, 16(r1)
+; CHECK-BE-WACC-NEXT: mtlr r0
+; CHECK-BE-WACC-NEXT: blr
%1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4)
%2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3)
tail call void @foo()
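
What drives the stxvp/lxvp sequences in both WACC blocks is an accumulator value that is live across an opaque call: the allocator extracts wacc0 into vector pairs, spills them, and rebuilds the accumulator afterwards. Condensed from the IR above (the function name is hypothetical, and xxsetaccz stands in for the assemble.acc priming):

declare void @foo()
declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)

define void @spill_sketch(ptr %out, <16 x i8> %x, <16 x i8> %y) {
  %z = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %a1 = call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %z, <16 x i8> %x, <16 x i8> %y)
  call void @foo()  ; %a1 is live across this call, forcing the spill
  %a2 = call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %a1, <16 x i8> %x, <16 x i8> %y)
  store <512 x i1> %a2, ptr %out, align 64
  ret void
}
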
diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll
index e932aec..7b36fa4 100644
--- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll
@@ -5,6 +5,12 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC

; Function Attrs: nofree nounwind writeonly
define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) {
@@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: test1:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test1:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %resp, align 64
@@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: test2:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test2:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
store <512 x i1> %0, ptr %resp, align 64
@@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: test3:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-LE-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-LE-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-LE-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test3:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: test4:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-LE-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-LE-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-LE-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-LE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test4:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: test5:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-LE-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-LE-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-LE-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test5:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-LE-WACC-LABEL: test6:
+; CHECK-LE-WACC: # %bb.0: # %entry
+; CHECK-LE-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-LE-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-LE-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-LE-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0
+; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-LE-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-LE-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-LE-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-LE-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-LE-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test6:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 8fbc9d7..3505cbb 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -5,6 +5,12 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC

; assemble_acc
declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
@@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) {
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: ass_acc:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: vmr v3, v2
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: ass_acc:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: vmr v3, v2
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %ptr, align 64
@@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) {
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: int_xxmtacc:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: vmr v3, v2
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: int_xxmtacc:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: vmr v3, v2
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
; One xxmtacc is generated from the call to assemble.acc, then one xxmtacc is
; generated from the call to xxmtacc, and one xxmfacc is generated for the store
@@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) {
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: int_xxmfacc:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: vmr v3, v2
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: int_xxmfacc:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: vmr v3, v2
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
; One xxmtacc is generated from the call to assemble.acc, then one xxmfacc is
; generated from the call to xxmfacc, and one xxmfacc is generated for the store
@@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) {
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: int_xxsetaccz:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: int_xxsetaccz:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
store <512 x i1> %0, ptr %ptr, align 64
@@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
; CHECK-BE-NEXT: stxv vs2, 0(r5)
; CHECK-BE-NEXT: stxv vs3, 0(r6)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: disass_acc:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v5, 0(r3)
+; CHECK-WACC-NEXT: stxv v4, 0(r4)
+; CHECK-WACC-NEXT: stxv v3, 0(r5)
+; CHECK-WACC-NEXT: stxv v2, 0(r6)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: disass_acc:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 0(r4)
+; CHECK-BE-WACC-NEXT: stxv v4, 0(r5)
+; CHECK-BE-WACC-NEXT: stxv v5, 0(r6)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
%1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0)
@@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) {
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testBranch:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: cmplwi r7, 0
+; CHECK-WACC-NEXT: beq cr0, .LBB5_2
+; CHECK-WACC-NEXT: # %bb.1: # %if.then
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: b .LBB5_3
+; CHECK-WACC-NEXT: .LBB5_2: # %if.else
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2
+; CHECK-WACC-NEXT: .LBB5_3: # %if.end
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testBranch:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: cmplwi r7, 0
+; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2
+; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: b .LBB5_3
+; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: .LBB5_3: # %if.end
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%tobool = icmp eq i32 %val, 0
br i1 %tobool, label %if.else, label %if.then
@@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) {
; CHECK-BE-NEXT: stxv vs3, 112(r3)
; CHECK-BE-NEXT: stxv vs2, 96(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testcse:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: stxv v4, 112(r3)
+; CHECK-WACC-NEXT: stxv v5, 96(r3)
+; CHECK-WACC-NEXT: stxv v2, 80(r3)
+; CHECK-WACC-NEXT: stxv v3, 64(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testcse:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: stxv v5, 112(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 96(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 80(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 64(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
%1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
@@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) {
; CHECK-BE-NEXT: stxv vs3, 112(r3)
; CHECK-BE-NEXT: stxv vs2, 96(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testcse2:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: dmxxsetaccz wacc1
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2
+; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 112(r3)
+; CHECK-WACC-NEXT: stxv v5, 96(r3)
+; CHECK-WACC-NEXT: stxv v2, 80(r3)
+; CHECK-WACC-NEXT: stxv v3, 64(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testcse2:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2
+; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 112(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 96(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 80(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 64(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
%1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
@@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) {
; CHECK-BE-NEXT: stxv vs3, 112(r3)
; CHECK-BE-NEXT: stxv vs2, 96(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testcse3:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: dmxxsetaccz wacc1
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2
+; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 112(r3)
+; CHECK-WACC-NEXT: stxv v5, 96(r3)
+; CHECK-WACC-NEXT: stxv v2, 80(r3)
+; CHECK-WACC-NEXT: stxv v3, 64(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testcse3:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2
+; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 112(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 96(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 80(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 64(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
%1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) {
; CHECK-BE-NEXT: bdnz .LBB9_2
; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testcse4:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: cmpwi r4, 1
+; CHECK-WACC-NEXT: bltlr cr0
+; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-WACC-NEXT: clrldi r4, r4, 32
+; CHECK-WACC-NEXT: mtctr r4
+; CHECK-WACC-NEXT: li r4, 0
+; CHECK-WACC-NEXT: li r6, 0
+; CHECK-WACC-NEXT: .p2align 4
+; CHECK-WACC-NEXT: .LBB9_2: # %for.body
+; CHECK-WACC-NEXT: #
+; CHECK-WACC-NEXT: rldic r7, r6, 4, 28
+; CHECK-WACC-NEXT: add r8, r5, r7
+; CHECK-WACC-NEXT: lxvx vs0, r5, r7
+; CHECK-WACC-NEXT: lxv vs1, 16(r8)
+; CHECK-WACC-NEXT: dmxxsetaccz wacc2
+; CHECK-WACC-NEXT: dmxxsetaccz wacc1
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1
+; CHECK-WACC-NEXT: lxv vs0, 32(r8)
+; CHECK-WACC-NEXT: lxv vs1, 48(r8)
+; CHECK-WACC-NEXT: rldic r7, r4, 6, 26
+; CHECK-WACC-NEXT: addi r4, r4, 3
+; CHECK-WACC-NEXT: addi r6, r6, 6
+; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1
+; CHECK-WACC-NEXT: lxv vs0, 64(r8)
+; CHECK-WACC-NEXT: lxv vs1, 80(r8)
+; CHECK-WACC-NEXT: add r8, r3, r7
+; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0
+; CHECK-WACC-NEXT: stxvx v3, r3, r7
+; CHECK-WACC-NEXT: stxv v4, 48(r8)
+; CHECK-WACC-NEXT: stxv v5, 32(r8)
+; CHECK-WACC-NEXT: stxv v2, 16(r8)
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-WACC-NEXT: stxv v4, 112(r8)
+; CHECK-WACC-NEXT: stxv v5, 96(r8)
+; CHECK-WACC-NEXT: stxv v2, 80(r8)
+; CHECK-WACC-NEXT: stxv v3, 64(r8)
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 176(r8)
+; CHECK-WACC-NEXT: stxv v5, 160(r8)
+; CHECK-WACC-NEXT: stxv v2, 144(r8)
+; CHECK-WACC-NEXT: stxv v3, 128(r8)
+; CHECK-WACC-NEXT: bdnz .LBB9_2
+; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testcse4:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: cmpwi r4, 1
+; CHECK-BE-WACC-NEXT: bltlr cr0
+; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32
+; CHECK-BE-WACC-NEXT: mtctr r4
+; CHECK-BE-WACC-NEXT: li r4, 0
+; CHECK-BE-WACC-NEXT: li r6, 0
+; CHECK-BE-WACC-NEXT: .p2align 4
+; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body
+; CHECK-BE-WACC-NEXT: #
+; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28
+; CHECK-BE-WACC-NEXT: add r8, r5, r7
+; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7
+; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8)
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1
+; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8)
+; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8)
+; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26
+; CHECK-BE-WACC-NEXT: addi r4, r4, 3
+; CHECK-BE-WACC-NEXT: addi r6, r6, 6
+; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1
+; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8)
+; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8)
+; CHECK-BE-WACC-NEXT: add r8, r3, r7
+; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0
+; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r8)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r8)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r8)
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 112(r8)
+; CHECK-BE-WACC-NEXT: stxv v4, 96(r8)
+; CHECK-BE-WACC-NEXT: stxv v3, 80(r8)
+; CHECK-BE-WACC-NEXT: stxv v2, 64(r8)
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 176(r8)
+; CHECK-BE-WACC-NEXT: stxv v4, 160(r8)
+; CHECK-BE-WACC-NEXT: stxv v3, 144(r8)
+; CHECK-BE-WACC-NEXT: stxv v2, 128(r8)
+; CHECK-BE-WACC-NEXT: bdnz .LBB9_2
+; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup
+; CHECK-BE-WACC-NEXT: blr
entry:
%cmp55 = icmp sgt i32 %lim, 0
br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup
@@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
; CHECK-BE-NEXT: ld r0, 16(r1)
; CHECK-BE-NEXT: mtlr r0
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testRedundantPrimeUnprime:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: mflr r0
+; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-WACC-NEXT: std r0, 16(r1)
+; CHECK-WACC-NEXT: stdu r1, -112(r1)
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-WACC-NEXT: stxv v0, 48(r3)
+; CHECK-WACC-NEXT: stxv v1, 32(r3)
+; CHECK-WACC-NEXT: stxv v4, 16(r3)
+; CHECK-WACC-NEXT: stxv v5, 0(r3)
+; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-WACC-NEXT: mr r30, r3
+; CHECK-WACC-NEXT: stxvp vsp36, 64(r1)
+; CHECK-WACC-NEXT: stxvp vsp34, 32(r1)
+; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc
+; CHECK-WACC-NEXT: lxvp vsp34, 64(r1)
+; CHECK-WACC-NEXT: lxvp vsp36, 32(r1)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 112(r30)
+; CHECK-WACC-NEXT: stxv v5, 96(r30)
+; CHECK-WACC-NEXT: stxv v2, 80(r30)
+; CHECK-WACC-NEXT: stxv v3, 64(r30)
+; CHECK-WACC-NEXT: addi r1, r1, 112
+; CHECK-WACC-NEXT: ld r0, 16(r1)
+; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-WACC-NEXT: mtlr r0
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: mflr r0
+; CHECK-BE-WACC-NEXT: std r0, 16(r1)
+; CHECK-BE-WACC-NEXT: stdu r1, -192(r1)
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v1, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v0, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v5, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 0(r3)
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0
+; CHECK-BE-WACC-NEXT: mr r30, r3
+; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1)
+; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1)
+; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF
+; CHECK-BE-WACC-NEXT: nop
+; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1)
+; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 112(r30)
+; CHECK-BE-WACC-NEXT: stxv v4, 96(r30)
+; CHECK-BE-WACC-NEXT: stxv v3, 80(r30)
+; CHECK-BE-WACC-NEXT: stxv v2, 64(r30)
+; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload
+; CHECK-BE-WACC-NEXT: addi r1, r1, 192
+; CHECK-BE-WACC-NEXT: ld r0, 16(r1)
+; CHECK-BE-WACC-NEXT: mtlr r0
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
store <512 x i1> %0, ptr %dst, align 64
@@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test_ldst_1:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0
+; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test_ldst_1:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0
+; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = getelementptr i8, ptr %vpp, i64 8
@@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test_ldst_2:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxvp vsp36, 0(r4)
+; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test_ldst_2:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4)
+; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp)
@@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x
; CHECK-BE-NEXT: stxv vs3, 48(r9)
; CHECK-BE-NEXT: stxv vs2, 32(r9)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test_ldst_3:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxvp vsp36, 0(r5)
+; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r9)
+; CHECK-WACC-NEXT: stxv v5, 32(r9)
+; CHECK-WACC-NEXT: stxv v2, 16(r9)
+; CHECK-WACC-NEXT: stxv v3, 0(r9)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test_ldst_3:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5)
+; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r9)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r9)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r9)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r9)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp)
diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
index ac6ad41..ff860b8 100644
--- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
@@ -5,6 +5,12 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC
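+; The -mcpu=future RUN lines reuse the same tests for the wacc accumulator
+; codegen, where accumulators are primed and extracted with dmxxinstdmr512 and
+; dmxxextfdmr512 rather than the pwr10 xxmtacc/xxmfacc pattern.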
declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>)
@@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: intrinsics1:
+; CHECK-WACC: # %bb.0:
+; CHECK-WACC-NEXT: vmr v1, v4
+; CHECK-WACC-NEXT: vmr v4, v3
+; CHECK-WACC-NEXT: vmr v0, v2
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4
+; CHECK-WACC-NEXT: ld r3, 96(r1)
+; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1
+; CHECK-WACC-NEXT: vmr v3, v2
+; CHECK-WACC-NEXT: vmr v2, v5
+; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0
+; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r3)
+; CHECK-WACC-NEXT: stxv v5, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: intrinsics1:
+; CHECK-BE-WACC: # %bb.0:
+; CHECK-BE-WACC-NEXT: vmr v1, v4
+; CHECK-BE-WACC-NEXT: vmr v4, v3
+; CHECK-BE-WACC-NEXT: vmr v0, v2
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4
+; CHECK-BE-WACC-NEXT: ld r3, 112(r1)
+; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1
+; CHECK-BE-WACC-NEXT: vmr v3, v2
+; CHECK-BE-WACC-NEXT: vmr v2, v5
+; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0
+; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
%1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> %vc2, <16 x i8> %vc4)
%2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2)
%3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3)
@@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) {
; CHECK-BE-NEXT: stxv vs2, 0(r5)
; CHECK-BE-NEXT: stxv vs3, 0(r6)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: intrinsics2:
+; CHECK-WACC: # %bb.0:
+; CHECK-WACC-NEXT: lxv v2, 0(r3)
+; CHECK-WACC-NEXT: lxv v4, 0(r5)
+; CHECK-WACC-NEXT: lxv v3, 0(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r6)
+; CHECK-WACC-NEXT: vmr v1, v2
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v3
+; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4
+; CHECK-WACC-NEXT: vmr v0, v5
+; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0
+; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v5, 0(r3)
+; CHECK-WACC-NEXT: stxv v4, 0(r4)
+; CHECK-WACC-NEXT: stxv v3, 0(r5)
+; CHECK-WACC-NEXT: stxv v2, 0(r6)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: intrinsics2:
+; CHECK-BE-WACC: # %bb.0:
+; CHECK-BE-WACC-NEXT: lxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r5)
+; CHECK-BE-WACC-NEXT: lxv v3, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 0(r6)
+; CHECK-BE-WACC-NEXT: vmr v1, v2
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3
+; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4
+; CHECK-BE-WACC-NEXT: vmr v0, v5
+; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0
+; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 0(r4)
+; CHECK-BE-WACC-NEXT: stxv v4, 0(r5)
+; CHECK-BE-WACC-NEXT: stxv v5, 0(r6)
+; CHECK-BE-WACC-NEXT: blr
%vc1 = load <16 x i8>, ptr %ptr1, align 16
%vc2 = load <16 x i8>, ptr %ptr2, align 16
%vc3 = load <16 x i8>, ptr %ptr3, align 16
@@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test1:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test1:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %resp, align 64
@@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test2:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test2:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test3:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test3:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
store <512 x i1> %0, ptr %resp, align 64
@@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test4:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test4:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test5:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test5:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %resp, align 64
@@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test6:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test6:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test7:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test7:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
store <512 x i1> %0, ptr %resp, align 64
@@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test8:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test8:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test9:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test9:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %resp, align 64
@@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test10:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test10:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test11:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test11:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
store <512 x i1> %0, ptr %resp, align 64
@@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test12:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test12:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test13:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test13:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %resp, align 64
@@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test14:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test14:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test15:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test15:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test16:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test16:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test17:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test17:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test18:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test18:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
store <512 x i1> %0, ptr %resp, align 64
@@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test19:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test19:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test20:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test20:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test21:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test21:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test22:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test22:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0)
@@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test23:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test23:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc)
store <512 x i1> %0, ptr %resp, align 64
@@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test24:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test24:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test25:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test25:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test26:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test26:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test27:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test27:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
@@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test28:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test28:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0)
store <512 x i1> %0, ptr %resp, align 64
@@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test29:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test29:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0)
@@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test30:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test30:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0)
@@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test31:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test31:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0)
@@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test32:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test32:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0)
@@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test33:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test33:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <256 x i1>, ptr %vpp, align 32
%1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc)
@@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test34:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test34:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test35:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test35:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test36:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test36:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test37:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test37:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test38:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test38:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <256 x i1>, ptr %vpp, align 32
%1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0)
@@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test39:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test39:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test40:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test40:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test41:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test41:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
@@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-NEXT: stxv vs3, 48(r7)
; CHECK-BE-NEXT: stxv vs2, 32(r7)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: test42:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v5, 0(r3)
+; CHECK-WACC-NEXT: lxv v1, 32(r3)
+; CHECK-WACC-NEXT: lxv v4, 16(r3)
+; CHECK-WACC-NEXT: lxv v0, 48(r3)
+; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-WACC-NEXT: lxv v4, 16(r4)
+; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r7)
+; CHECK-WACC-NEXT: stxv v5, 32(r7)
+; CHECK-WACC-NEXT: stxv v2, 16(r7)
+; CHECK-WACC-NEXT: stxv v3, 0(r7)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: test42:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
+; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
+; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
+; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r7)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r7)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r7)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <512 x i1>, ptr %vqp, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll
index 89e5147..37d0e69 100644
--- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll
@@ -5,6 +5,12 @@
; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC
+; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC
declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>)
declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
@@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) {
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testPHI1:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: cmpwi r5, 3
+; CHECK-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-WACC-NEXT: blt cr0, .LBB0_3
+; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-WACC-NEXT: clrldi r5, r5, 32
+; CHECK-WACC-NEXT: addi r5, r5, -2
+; CHECK-WACC-NEXT: lxv v2, 0(r4)
+; CHECK-WACC-NEXT: lxv v3, 16(r4)
+; CHECK-WACC-NEXT: mtctr r5
+; CHECK-WACC-NEXT: addi r4, r4, 32
+; CHECK-WACC-NEXT: .p2align 4
+; CHECK-WACC-NEXT: .LBB0_2: # %for.body
+; CHECK-WACC-NEXT: #
+; CHECK-WACC-NEXT: lxv vs0, 0(r4)
+; CHECK-WACC-NEXT: addi r4, r4, 16
+; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0
+; CHECK-WACC-NEXT: bdnz .LBB0_2
+; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v5, 0(r3)
+; CHECK-WACC-NEXT: stxv v4, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 48(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testPHI1:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: cmpwi r5, 3
+; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
+; CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3
+; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32
+; CHECK-BE-WACC-NEXT: addi r5, r5, -2
+; CHECK-BE-WACC-NEXT: lxv v2, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v3, 16(r4)
+; CHECK-BE-WACC-NEXT: mtctr r5
+; CHECK-BE-WACC-NEXT: addi r4, r4, 32
+; CHECK-BE-WACC-NEXT: .p2align 4
+; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body
+; CHECK-BE-WACC-NEXT: #
+; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4)
+; CHECK-BE-WACC-NEXT: addi r4, r4, 16
+; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0
+; CHECK-BE-WACC-NEXT: bdnz .LBB0_2
+; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <16 x i8>, ptr %Src, align 16
%arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1
@@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) {
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testPHI2:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: lxv v2, 0(r4)
+; CHECK-WACC-NEXT: lxv v3, 16(r4)
+; CHECK-WACC-NEXT: lxv vs0, 32(r4)
+; CHECK-WACC-NEXT: cmpwi r5, 4
+; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0
+; CHECK-WACC-NEXT: blt cr0, .LBB1_3
+; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-WACC-NEXT: clrldi r5, r5, 32
+; CHECK-WACC-NEXT: addi r5, r5, -3
+; CHECK-WACC-NEXT: mtctr r5
+; CHECK-WACC-NEXT: addi r4, r4, 48
+; CHECK-WACC-NEXT: .p2align 4
+; CHECK-WACC-NEXT: .LBB1_2: # %for.body
+; CHECK-WACC-NEXT: #
+; CHECK-WACC-NEXT: lxv vs0, 0(r4)
+; CHECK-WACC-NEXT: addi r4, r4, 16
+; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0
+; CHECK-WACC-NEXT: bdnz .LBB1_2
+; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v5, 0(r3)
+; CHECK-WACC-NEXT: stxv v4, 16(r3)
+; CHECK-WACC-NEXT: stxv v3, 32(r3)
+; CHECK-WACC-NEXT: stxv v2, 48(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testPHI2:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: lxv v2, 0(r4)
+; CHECK-BE-WACC-NEXT: lxv v3, 16(r4)
+; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4)
+; CHECK-BE-WACC-NEXT: cmpwi r5, 4
+; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0
+; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3
+; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32
+; CHECK-BE-WACC-NEXT: addi r5, r5, -3
+; CHECK-BE-WACC-NEXT: mtctr r5
+; CHECK-BE-WACC-NEXT: addi r4, r4, 48
+; CHECK-BE-WACC-NEXT: .p2align 4
+; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body
+; CHECK-BE-WACC-NEXT: #
+; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4)
+; CHECK-BE-WACC-NEXT: addi r4, r4, 16
+; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0
+; CHECK-BE-WACC-NEXT: bdnz .LBB1_2
+; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r3)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r3)
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r3)
+; CHECK-BE-WACC-NEXT: blr
entry:
%0 = load <16 x i8>, ptr %Src, align 16
%arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1
@@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) {
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs3, 0(r3)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testImplicitDef:
+; CHECK-WACC: # %bb.0: # %label1
+; CHECK-WACC-NEXT: # implicit-def: $wacc0
+; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2
+; CHECK-WACC-NEXT: # %bb.1: # %label2
+; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0
+; CHECK-WACC-NEXT: .LBB2_2: # %label3
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: stxv v2, 0(r3)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testImplicitDef:
+; CHECK-BE-WACC: # %bb.0: # %label1
+; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0
+; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2
+; CHECK-BE-WACC-NEXT: # %bb.1: # %label2
+; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0
+; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 0(r3)
+; CHECK-BE-WACC-NEXT: blr
label1:
br i1 undef, label %label3, label %label2
@@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun
; CHECK-BE-NEXT: stxv vs3, 48(r5)
; CHECK-BE-NEXT: stxv vs2, 32(r5)
; CHECK-BE-NEXT: blr
+;
+; CHECK-WACC-LABEL: testNestedPHI:
+; CHECK-WACC: # %bb.0: # %entry
+; CHECK-WACC-NEXT: cmplwi r3, 0
+; CHECK-WACC-NEXT: beq cr0, .LBB3_2
+; CHECK-WACC-NEXT: # %bb.1: # %if.then
+; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2
+; CHECK-WACC-NEXT: cmpwi r4, 1
+; CHECK-WACC-NEXT: bge cr0, .LBB3_3
+; CHECK-WACC-NEXT: b .LBB3_5
+; CHECK-WACC-NEXT: .LBB3_2:
+; CHECK-WACC-NEXT: # implicit-def: $wacc0
+; CHECK-WACC-NEXT: cmpwi r4, 1
+; CHECK-WACC-NEXT: blt cr0, .LBB3_5
+; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader
+; CHECK-WACC-NEXT: addi r3, r4, -1
+; CHECK-WACC-NEXT: clrldi r3, r3, 32
+; CHECK-WACC-NEXT: addi r3, r3, 1
+; CHECK-WACC-NEXT: mtctr r3
+; CHECK-WACC-NEXT: .p2align 4
+; CHECK-WACC-NEXT: .LBB3_4: # %for.body
+; CHECK-WACC-NEXT: #
+; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2
+; CHECK-WACC-NEXT: bdnz .LBB3_4
+; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup
+; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-WACC-NEXT: li r3, 0
+; CHECK-WACC-NEXT: stxv v4, 48(r5)
+; CHECK-WACC-NEXT: stxv v5, 32(r5)
+; CHECK-WACC-NEXT: stxv v2, 16(r5)
+; CHECK-WACC-NEXT: stxv v3, 0(r5)
+; CHECK-WACC-NEXT: blr
+;
+; CHECK-BE-WACC-LABEL: testNestedPHI:
+; CHECK-BE-WACC: # %bb.0: # %entry
+; CHECK-BE-WACC-NEXT: cmplwi r3, 0
+; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2
+; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then
+; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: cmpwi r4, 1
+; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3
+; CHECK-BE-WACC-NEXT: b .LBB3_5
+; CHECK-BE-WACC-NEXT: .LBB3_2:
+; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0
+; CHECK-BE-WACC-NEXT: cmpwi r4, 1
+; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5
+; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader
+; CHECK-BE-WACC-NEXT: addi r3, r4, -1
+; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32
+; CHECK-BE-WACC-NEXT: addi r3, r3, 1
+; CHECK-BE-WACC-NEXT: mtctr r3
+; CHECK-BE-WACC-NEXT: .p2align 4
+; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body
+; CHECK-BE-WACC-NEXT: #
+; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2
+; CHECK-BE-WACC-NEXT: bdnz .LBB3_4
+; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup
+; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-WACC-NEXT: li r3, 0
+; CHECK-BE-WACC-NEXT: stxv v5, 48(r5)
+; CHECK-BE-WACC-NEXT: stxv v4, 32(r5)
+; CHECK-BE-WACC-NEXT: stxv v3, 16(r5)
+; CHECK-BE-WACC-NEXT: stxv v2, 0(r5)
+; CHECK-BE-WACC-NEXT: blr
entry:
%tobool.not = icmp eq i32 %cond, 0
br i1 %tobool.not, label %if.end, label %if.then
diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
index 291cf97..929bf5f 100644
--- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
+++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN: -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC
target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"
@@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 {
; CHECK-NEXT: xxswapd 0, 0
; CHECK-NEXT: stxv 0, 0(3)
; CHECK-NEXT: blr
+;
+; CHECK-WACC-LABEL: baz:
+; CHECK-WACC: # %bb.0: # %bb
+; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0
+; CHECK-WACC-NEXT: xxmrgld 1, 34, 36
+; CHECK-WACC-NEXT: xxswapd 2, 1
+; CHECK-WACC-NEXT: xxlxor 0, 0, 0
+; CHECK-WACC-NEXT: xvnegdp 1, 1
+; CHECK-WACC-NEXT: xvnegdp 2, 2
+; CHECK-WACC-NEXT: xvsubdp 1, 1, 0
+; CHECK-WACC-NEXT: xvsubdp 2, 2, 37
+; CHECK-WACC-NEXT: xvmuldp 1, 1, 0
+; CHECK-WACC-NEXT: xvmuldp 2, 2, 0
+; CHECK-WACC-NEXT: xvmaddadp 1, 0, 0
+; CHECK-WACC-NEXT: xvmaddadp 2, 0, 0
+; CHECK-WACC-NEXT: stxv 1, 0(3)
+; CHECK-WACC-NEXT: stxv 2, 0(3)
+; CHECK-WACC-NEXT: # implicit-def: $wacc0
+; CHECK-WACC-NEXT: bc 12, 20, L..BB0_2
+; CHECK-WACC-NEXT: # %bb.1: # %bb10
+; CHECK-WACC-NEXT: xvf64gerpp 0, 34, 0
+; CHECK-WACC-NEXT: L..BB0_2: # %bb12
+; CHECK-WACC-NEXT: cmpdi 3, 0
+; CHECK-WACC-NEXT: .align 4
+; CHECK-WACC-NEXT: L..BB0_3: # %bb13
+; CHECK-WACC-NEXT: #
+; CHECK-WACC-NEXT: bc 4, 2, L..BB0_3
+; CHECK-WACC-NEXT: # %bb.4: # %bb14
+; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0
+; CHECK-WACC-NEXT: xxlxor 0, 0, 0
+; CHECK-WACC-NEXT: xvsubdp 1, 0, 35
+; CHECK-WACC-NEXT: xxlxor 2, 2, 2
+; CHECK-WACC-NEXT: xvmaddadp 2, 1, 2
+; CHECK-WACC-NEXT: xvadddp 0, 2, 0
+; CHECK-WACC-NEXT: xxswapd 0, 0
+; CHECK-WACC-NEXT: stxv 0, 0(3)
+; CHECK-WACC-NEXT: blr
bb:
%call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison)
%extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0
diff --git a/llvm/test/CodeGen/PowerPC/vec_rounding.ll b/llvm/test/CodeGen/PowerPC/vec_rounding.ll
index 2f16a43..438c8eb 100644
--- a/llvm/test/CodeGen/PowerPC/vec_rounding.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_rounding.ll
@@ -1,172 +1,251 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
; Check vector round to single-precision toward -infinity (vrfim)
; instruction generation using Altivec.
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
define <2 x double> @floor_v2f64(<2 x double> %p)
+; CHECK-LABEL: floor_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: frim 1, 1
+; CHECK-NEXT: frim 2, 2
+; CHECK-NEXT: blr
{
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
ret <2 x double> %t
}
-; CHECK-LABEL: floor_v2f64:
-; CHECK: frim
-; CHECK: frim
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
define <4 x double> @floor_v4f64(<4 x double> %p)
+; CHECK-LABEL: floor_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: frim 1, 1
+; CHECK-NEXT: frim 2, 2
+; CHECK-NEXT: frim 3, 3
+; CHECK-NEXT: frim 4, 4
+; CHECK-NEXT: blr
{
%t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
ret <4 x double> %t
}
-; CHECK-LABEL: floor_v4f64:
-; CHECK: frim
-; CHECK: frim
-; CHECK: frim
-; CHECK: frim
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
define <2 x double> @ceil_v2f64(<2 x double> %p)
+; CHECK-LABEL: ceil_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: frip 1, 1
+; CHECK-NEXT: frip 2, 2
+; CHECK-NEXT: blr
{
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
ret <2 x double> %t
}
-; CHECK-LABEL: ceil_v2f64:
-; CHECK: frip
-; CHECK: frip
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
define <4 x double> @ceil_v4f64(<4 x double> %p)
+; CHECK-LABEL: ceil_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: frip 1, 1
+; CHECK-NEXT: frip 2, 2
+; CHECK-NEXT: frip 3, 3
+; CHECK-NEXT: frip 4, 4
+; CHECK-NEXT: blr
{
%t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
ret <4 x double> %t
}
-; CHECK-LABEL: ceil_v4f64:
-; CHECK: frip
-; CHECK: frip
-; CHECK: frip
-; CHECK: frip
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
define <2 x double> @trunc_v2f64(<2 x double> %p)
+; CHECK-LABEL: trunc_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: friz 1, 1
+; CHECK-NEXT: friz 2, 2
+; CHECK-NEXT: blr
{
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
ret <2 x double> %t
}
-; CHECK-LABEL: trunc_v2f64:
-; CHECK: friz
-; CHECK: friz
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
define <4 x double> @trunc_v4f64(<4 x double> %p)
+; CHECK-LABEL: trunc_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: friz 1, 1
+; CHECK-NEXT: friz 2, 2
+; CHECK-NEXT: friz 3, 3
+; CHECK-NEXT: friz 4, 4
+; CHECK-NEXT: blr
{
%t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
ret <4 x double> %t
}
-; CHECK-LABEL: trunc_v4f64:
-; CHECK: friz
-; CHECK: friz
-; CHECK: friz
-; CHECK: friz
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
-define <2 x double> @nearbyint_v2f64(<2 x double> %p)
+define <2 x double> @nearbyint_v2f64(<2 x double> %p) nounwind
+; CHECK-LABEL: nearbyint_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr 0
+; CHECK-NEXT: stdu 1, -128(1)
+; CHECK-NEXT: std 0, 144(1)
+; CHECK-NEXT: stfd 30, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd 31, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT: fmr 31, 2
+; CHECK-NEXT: bl nearbyint
+; CHECK-NEXT: nop
+; CHECK-NEXT: fmr 30, 1
+; CHECK-NEXT: fmr 1, 31
+; CHECK-NEXT: bl nearbyint
+; CHECK-NEXT: nop
+; CHECK-NEXT: fmr 2, 1
+; CHECK-NEXT: fmr 1, 30
+; CHECK-NEXT: lfd 31, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT: lfd 30, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 128
+; CHECK-NEXT: ld 0, 16(1)
+; CHECK-NEXT: mtlr 0
+; CHECK-NEXT: blr
{
%t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
-; CHECK-LABEL: nearbyint_v2f64:
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
-define <4 x double> @nearbyint_v4f64(<4 x double> %p)
+define <4 x double> @nearbyint_v4f64(<4 x double> %p) nounwind
+; CHECK-LABEL: nearbyint_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr 0
+; CHECK-NEXT: stdu 1, -144(1)
+; CHECK-NEXT: std 0, 160(1)
+; CHECK-NEXT: stfd 28, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT: stfd 29, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT: fmr 29, 2
+; CHECK-NEXT: stfd 30, 128(1) # 8-byte Folded Spill
+; CHECK-NEXT: fmr 30, 3
+; CHECK-NEXT: stfd 31, 136(1) # 8-byte Folded Spill
+; CHECK-NEXT: fmr 31, 4
+; CHECK-NEXT: bl nearbyint
+; CHECK-NEXT: nop
+; CHECK-NEXT: fmr 28, 1
+; CHECK-NEXT: fmr 1, 29
+; CHECK-NEXT: bl nearbyint
+; CHECK-NEXT: nop
+; CHECK-NEXT: fmr 29, 1
+; CHECK-NEXT: fmr 1, 30
+; CHECK-NEXT: bl nearbyint
+; CHECK-NEXT: nop
+; CHECK-NEXT: fmr 30, 1
+; CHECK-NEXT: fmr 1, 31
+; CHECK-NEXT: bl nearbyint
+; CHECK-NEXT: nop
+; CHECK-NEXT: fmr 4, 1
+; CHECK-NEXT: fmr 1, 28
+; CHECK-NEXT: lfd 31, 136(1) # 8-byte Folded Reload
+; CHECK-NEXT: lfd 28, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT: fmr 2, 29
+; CHECK-NEXT: fmr 3, 30
+; CHECK-NEXT: lfd 30, 128(1) # 8-byte Folded Reload
+; CHECK-NEXT: lfd 29, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 144
+; CHECK-NEXT: ld 0, 16(1)
+; CHECK-NEXT: mtlr 0
+; CHECK-NEXT: blr
{
%t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
-; CHECK-LABEL: nearbyint_v4f64:
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
define <4 x float> @floor_v4f32(<4 x float> %p)
+; CHECK-LABEL: floor_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfim 2, 2
+; CHECK-NEXT: blr
{
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
ret <4 x float> %t
}
-; CHECK-LABEL: floor_v4f32:
-; CHECK: vrfim
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
define <8 x float> @floor_v8f32(<8 x float> %p)
+; CHECK-LABEL: floor_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfim 2, 2
+; CHECK-NEXT: vrfim 3, 3
+; CHECK-NEXT: blr
{
%t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
ret <8 x float> %t
}
-; CHECK-LABEL: floor_v8f32:
-; CHECK: vrfim
-; CHECK: vrfim
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
define <4 x float> @ceil_v4f32(<4 x float> %p)
+; CHECK-LABEL: ceil_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfip 2, 2
+; CHECK-NEXT: blr
{
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
ret <4 x float> %t
}
-; CHECK-LABEL: ceil_v4f32:
-; CHECK: vrfip
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
define <8 x float> @ceil_v8f32(<8 x float> %p)
+; CHECK-LABEL: ceil_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfip 2, 2
+; CHECK-NEXT: vrfip 3, 3
+; CHECK-NEXT: blr
{
%t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
ret <8 x float> %t
}
-; CHECK-LABEL: ceil_v8f32:
-; CHECK: vrfip
-; CHECK: vrfip
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
define <4 x float> @trunc_v4f32(<4 x float> %p)
+; CHECK-LABEL: trunc_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfiz 2, 2
+; CHECK-NEXT: blr
{
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
ret <4 x float> %t
}
-; CHECK-LABEL: trunc_v4f32:
-; CHECK: vrfiz
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
define <8 x float> @trunc_v8f32(<8 x float> %p)
+; CHECK-LABEL: trunc_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfiz 2, 2
+; CHECK-NEXT: vrfiz 3, 3
+; CHECK-NEXT: blr
{
%t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
ret <8 x float> %t
}
-; CHECK-LABEL: trunc_v8f32:
-; CHECK: vrfiz
-; CHECK: vrfiz
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
define <4 x float> @nearbyint_v4f32(<4 x float> %p)
+; CHECK-LABEL: nearbyint_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfin 2, 2
+; CHECK-NEXT: blr
{
%t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
-; CHECK-LABEL: nearbyint_v4f32:
-; CHECK: vrfin
declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
define <8 x float> @nearbyint_v8f32(<8 x float> %p)
+; CHECK-LABEL: nearbyint_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrfin 2, 2
+; CHECK-NEXT: vrfin 3, 3
+; CHECK-NEXT: blr
{
%t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
-; CHECK-LABEL: nearbyint_v8f32:
-; CHECK: vrfin
-; CHECK: vrfin
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index ea08061..769823d 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -75,6 +75,7 @@
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
+; CHECK-NEXT: RISC-V Promote Constants
; CHECK-NEXT: A No-Op Barrier Pass
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Merge internal globals
diff --git a/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir
new file mode 100644
index 0000000..7844589
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir
@@ -0,0 +1,38 @@
+# RUN: llc %s -mtriple=riscv64 \
+# RUN: -run-pass=cfi-instr-inserter \
+# RUN: -riscv-enable-cfi-instr-inserter=true
+# XFAIL: *
+
+# Technically, it is possible that a callee-saved register is saved in multiple different locations.
+# CFIInstrInserter should handle this, but currently it does not.
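+# For reference: `CFI_INSTRUCTION register $x9, $x5` records that x9's value
+# currently lives in x5 (DWARF .cfi_register), while `CFI_INSTRUCTION offset
+# $x9, 0` records a save to memory at CFA+0 (DWARF .cfi_offset).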
+---
+name: multiple_locations
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x9, $x2
+ BEQ $x10, $x0, %bb.3
+ PseudoBR %bb.2
+
+ bb.1:
+ liveins: $x10, $x9, $x2
+ $x5 = COPY $x9
+ CFI_INSTRUCTION register $x9, $x5
+ $x9 = COPY $x5
+ CFI_INSTRUCTION register $x9, $x9
+ PseudoBR %bb.3
+
+ bb.2:
+ liveins: $x10, $x9, $x2
+ SD $x9, $x2, 0 :: (store (s64))
+ CFI_INSTRUCTION offset $x9, 0
+ $x9 = LD $x2, 0 :: (load (s64))
+ CFI_INSTRUCTION register $x9, $x9
+ PseudoBR %bb.3
+
+ bb.3:
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 988d049..cf44af6 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -137,6 +137,7 @@
; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension.
; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode).
; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb.
+; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction.
; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization.
; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values).
; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp).
diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
new file mode 100644
index 0000000..4e73cee
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64
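+; The (shl nsw -1, %y) & %x pattern clears the low %y bits of %x; at native
+; word width it lowers to an srl/sll shift pair rather than materialising the
+; mask.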
+
+define i32 @mask_pair(i32 %x, i32 %y) {
+; RV32-LABEL: mask_pair:
+; RV32: # %bb.0:
+; RV32-NEXT: srl a0, a0, a1
+; RV32-NEXT: sll a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair:
+; RV64: # %bb.0:
+; RV64-NEXT: srlw a0, a0, a1
+; RV64-NEXT: sllw a0, a0, a1
+; RV64-NEXT: ret
+ %shl = shl nsw i32 -1, %y
+ %and = and i32 %shl, %x
+ ret i32 %and
+}
+
+define i64 @mask_pair_64(i64 %x, i64 %y) {
+; RV32-LABEL: mask_pair_64:
+; RV32: # %bb.0:
+; RV32-NEXT: li a3, -1
+; RV32-NEXT: addi a4, a2, -32
+; RV32-NEXT: sll a3, a3, a2
+; RV32-NEXT: bltz a4, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a2, a3
+; RV32-NEXT: j .LBB1_3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: not a2, a2
+; RV32-NEXT: lui a5, 524288
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: srl a2, a5, a2
+; RV32-NEXT: or a2, a3, a2
+; RV32-NEXT: .LBB1_3:
+; RV32-NEXT: srai a4, a4, 31
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: and a0, a3, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_64:
+; RV64: # %bb.0:
+; RV64-NEXT: srl a0, a0, a1
+; RV64-NEXT: sll a0, a0, a1
+; RV64-NEXT: ret
+ %shl = shl nsw i64 -1, %y
+ %and = and i64 %shl, %x
+ ret i64 %and
+}
+
+define i128 @mask_pair_128(i128 %x, i128 %y) {
+; RV32-LABEL: mask_pair_128:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a3, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: lw a2, 0(a2)
+; RV32-NEXT: li a6, -1
+; RV32-NEXT: sw zero, 0(sp)
+; RV32-NEXT: sw zero, 4(sp)
+; RV32-NEXT: sw zero, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: addi a7, sp, 16
+; RV32-NEXT: sw a6, 16(sp)
+; RV32-NEXT: sw a6, 20(sp)
+; RV32-NEXT: sw a6, 24(sp)
+; RV32-NEXT: sw a6, 28(sp)
+; RV32-NEXT: srli a6, a2, 3
+; RV32-NEXT: andi a6, a6, 12
+; RV32-NEXT: sub a6, a7, a6
+; RV32-NEXT: lw a7, 4(a6)
+; RV32-NEXT: lw t0, 8(a6)
+; RV32-NEXT: lw t1, 12(a6)
+; RV32-NEXT: lw a6, 0(a6)
+; RV32-NEXT: andi t2, a2, 31
+; RV32-NEXT: xori t2, t2, 31
+; RV32-NEXT: sll t1, t1, a2
+; RV32-NEXT: srli t3, t0, 1
+; RV32-NEXT: sll t0, t0, a2
+; RV32-NEXT: srli t4, a7, 1
+; RV32-NEXT: sll a7, a7, a2
+; RV32-NEXT: sll a2, a6, a2
+; RV32-NEXT: srli a6, a6, 1
+; RV32-NEXT: srl t3, t3, t2
+; RV32-NEXT: srl t4, t4, t2
+; RV32-NEXT: srl a6, a6, t2
+; RV32-NEXT: and a2, a2, a5
+; RV32-NEXT: or a5, t1, t3
+; RV32-NEXT: or t0, t0, t4
+; RV32-NEXT: or a6, a7, a6
+; RV32-NEXT: and a4, a6, a4
+; RV32-NEXT: and a3, t0, a3
+; RV32-NEXT: and a1, a5, a1
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a4, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mask_pair_128:
+; RV64: # %bb.0:
+; RV64-NEXT: li a5, -1
+; RV64-NEXT: addi a4, a2, -64
+; RV64-NEXT: sll a3, a5, a2
+; RV64-NEXT: bltz a4, .LBB2_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: j .LBB2_3
+; RV64-NEXT: .LBB2_2:
+; RV64-NEXT: not a2, a2
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: srl a2, a5, a2
+; RV64-NEXT: or a2, a3, a2
+; RV64-NEXT: .LBB2_3:
+; RV64-NEXT: srai a4, a4, 63
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: and a0, a3, a0
+; RV64-NEXT: ret
+ %shl = shl nsw i128 -1, %y
+ %and = and i128 %shl, %x
+ ret i128 %and
+}
diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
index c489bc3..aa63552 100644
--- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll
@@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)
;.
; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }
;.
diff --git a/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll
new file mode 100644
index 0000000..2bde601
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt %s -S -riscv-promote-const -mtriple=riscv64 -mattr=+d | FileCheck %s
+
+; No promotion should take place, as the pass skips floats.
+define float @multiple_floats(float %a, float %b) {
+; CHECK-LABEL: define float @multiple_floats(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[A]], 1.000000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[B]], 2.000000e+00
+; CHECK-NEXT: [[SUM_F:%.*]] = fadd float [[ADD1]], [[ADD2]]
+; CHECK-NEXT: ret float [[SUM_F]]
+;
+entry:
+ %add1 = fadd float %a, 1.0
+ %add2 = fadd float %b, 2.0
+ %sum_f = fadd float %add1, %add2
+ ret float %sum_f
+}
+
+; No promotion should take place as cases with a single constant are skipped.
+define double @single_double(double %a) {
+; CHECK-LABEL: define double @single_double(
+; CHECK-SAME: double [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 4.210000e+01
+; CHECK-NEXT: ret double [[ADD]]
+;
+entry:
+ %add = fadd double %a, 42.1
+ ret double %add
+}
+
+; Promotion should happen as we have at least two unique constants that would
+; otherwise go in the constant pool.
+define double @multiple_doubles(double %a, double %b) {
+; CHECK-LABEL: define double @multiple_doubles(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles, i64 0, i64 1), align 8
+; CHECK-NEXT: [[ADD3:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles, align 8
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[A]], [[ADD3]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[B]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]]
+; CHECK-NEXT: [[SUM1:%.*]] = fadd double [[ADD4]], [[SUM]]
+; CHECK-NEXT: ret double [[SUM1]]
+;
+entry:
+ %add1 = fadd double %a, 2.718
+ %add2 = fadd double %b, 42.1
+ %add3 = fadd double %add1, 2.718
+ %sum = fadd double %add2, %add3
+ ret double %sum
+}
+
+; Promotion should not happen as the constants will be materialised directly
+; rather than loaded from the constant pool.
+define double @multiple_doubles_no_promote(double %a, double %b) {
+; CHECK-LABEL: define double @multiple_doubles_no_promote(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[A]], 1.000000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[B]], 2.000000e+00
+; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[ADD1]], 1.000000e+00
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]]
+; CHECK-NEXT: ret double [[SUM]]
+;
+entry:
+ %add1 = fadd double %a, 1.0
+ %add2 = fadd double %b, 2.0
+ %add3 = fadd double %add1, 1.0
+ %sum = fadd double %add2, %add3
+ ret double %sum
+}
+
+; The same constant shouldn't be loaded more than once per BB.
+define double @multiple_doubles_multi_bb(double %a, i1 %cond) {
+; CHECK-LABEL: define double @multiple_doubles_multi_bb(
+; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[COND]], label %[[IF_TRUE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_TRUE]]:
+; CHECK-NEXT: [[DOUBLE_VAL2:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8
+; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8
+; CHECK-NEXT: [[ADD_T:%.*]] = fadd double [[A]], [[DOUBLE_VAL]]
+; CHECK-NEXT: [[MUL_T:%.*]] = fmul double [[ADD_T]], [[DOUBLE_VAL2]]
+; CHECK-NEXT: [[SUB_T:%.*]] = fsub double [[MUL_T]], [[DOUBLE_VAL]]
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: [[DOUBLE_VAL3:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8
+; CHECK-NEXT: [[ADD_F:%.*]] = fadd double [[A]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: [[MUL_F:%.*]] = fmul double [[ADD_F]], [[DOUBLE_VAL3]]
+; CHECK-NEXT: [[SUB_F:%.*]] = fsub double [[MUL_F]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[PHI_RES:%.*]] = phi double [ [[SUB_T]], %[[IF_TRUE]] ], [ [[SUB_F]], %[[IF_FALSE]] ]
+; CHECK-NEXT: ret double [[PHI_RES]]
+;
+entry:
+ br i1 %cond, label %if.true, label %if.false
+
+if.true:
+ %add.t = fadd double %a, 1.23
+ %mul.t = fmul double %add.t, 4.56
+ %sub.t = fsub double %mul.t, 1.23
+ br label %if.end
+
+if.false:
+ %add.f = fadd double %a, 1.23
+ %mul.f = fmul double %add.f, 4.56
+ %sub.f = fsub double %mul.f, 1.23
+ br label %if.end
+
+if.end:
+ %phi.res = phi double [ %sub.t, %if.true ], [ %sub.f, %if.false ]
+ ret double %phi.res
+}
+
+; Check the insertion point in the case where a phi takes a constant C and
+; its source block also uses that same constant.
+define double @multiple_doubles_phi(double %a, i1 %cond) {
+; CHECK-LABEL: define double @multiple_doubles_phi(
+; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_phi, align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[DOUBLE_VAL]]
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[PHI_VAL:%.*]] = phi double [ [[DOUBLE_VAL]], %[[IF_THEN]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_phi, i64 0, i64 1), align 8
+; CHECK-NEXT: [[RES:%.*]] = fadd double [[PHI_VAL]], [[DOUBLE_VAL1]]
+; CHECK-NEXT: ret double [[RES]]
+;
+entry:
+ br i1 %cond, label %if.then, label %if.end
+
+if.then:
+ %mul = fmul double %a, 1.23
+ br label %if.end
+
+if.end:
+ %phi.val = phi double [ 1.23, %if.then ], [ %a, %entry ]
+ %res = fadd double %phi.val, 4.56
+ ret double %res
+}
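
For readers skimming the checks above: a minimal sketch of the transformation these promoted-doubles tests exercise, assuming the behaviour implied by the CHECK lines. The @.promoted_doubles.* array naming is what the checks match; the global's linkage and the before/after pairing here are illustrative, not verbatim pass output.

; Before promotion: each use of the constant is an inline FP immediate.
define double @sketch(double %a) {
  %r = fadd double %a, 1.23
  ret double %r
}

; After promotion: the constant lives in a per-function global array and,
; per the multi-BB test above, is loaded at most once per basic block.
@.promoted_doubles.sketch = private constant [1 x double] [double 1.23]

define double @sketch.promoted(double %a) {
  %c = load double, ptr @.promoted_doubles.sketch, align 8
  %r = fadd double %a, %c
  ret double %r
}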
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll
new file mode 100644
index 0000000..3f780fd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \
+; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \
+; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \
+; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M
+
+define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) {
+; RV32I-M-LABEL: select_example_mul_i32:
+; RV32I-M: # %bb.0: # %entry
+; RV32I-M-NEXT: beqz a2, .LBB0_2
+; RV32I-M-NEXT: # %bb.1:
+; RV32I-M-NEXT: mul a1, a0, a3
+; RV32I-M-NEXT: .LBB0_2: # %entry
+; RV32I-M-NEXT: mv a0, a1
+; RV32I-M-NEXT: ret
+;
+; RV64I-M-LABEL: select_example_mul_i32:
+; RV64I-M: # %bb.0: # %entry
+; RV64I-M-NEXT: beqz a2, .LBB0_2
+; RV64I-M-NEXT: # %bb.1:
+; RV64I-M-NEXT: mulw a1, a0, a3
+; RV64I-M-NEXT: .LBB0_2: # %entry
+; RV64I-M-NEXT: mv a0, a1
+; RV64I-M-NEXT: ret
+;
+; RV32I-SFB-M-LABEL: select_example_mul_i32:
+; RV32I-SFB-M: # %bb.0: # %entry
+; RV32I-SFB-M-NEXT: mul a0, a0, a3
+; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2
+; RV32I-SFB-M-NEXT: # %bb.1: # %entry
+; RV32I-SFB-M-NEXT: mv a0, a1
+; RV32I-SFB-M-NEXT: .LBB0_2: # %entry
+; RV32I-SFB-M-NEXT: ret
+;
+; RV64I-SFB-M-LABEL: select_example_mul_i32:
+; RV64I-SFB-M: # %bb.0: # %entry
+; RV64I-SFB-M-NEXT: mulw a0, a0, a3
+; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2
+; RV64I-SFB-M-NEXT: # %bb.1: # %entry
+; RV64I-SFB-M-NEXT: mv a0, a1
+; RV64I-SFB-M-NEXT: .LBB0_2: # %entry
+; RV64I-SFB-M-NEXT: ret
+;
+; RV32I-SFBIMul-M-LABEL: select_example_mul_i32:
+; RV32I-SFBIMul-M: # %bb.0: # %entry
+; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2
+; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry
+; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3
+; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry
+; RV32I-SFBIMul-M-NEXT: mv a0, a1
+; RV32I-SFBIMul-M-NEXT: ret
+;
+; RV64I-SFBIMul-M-LABEL: select_example_mul_i32:
+; RV64I-SFBIMul-M: # %bb.0: # %entry
+; RV64I-SFBIMul-M-NEXT: mulw a0, a0, a3
+; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2
+; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry
+; RV64I-SFBIMul-M-NEXT: mv a0, a1
+; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry
+; RV64I-SFBIMul-M-NEXT: ret
+entry:
+ %res = mul i32 %a, %y
+ %sel = select i1 %x, i32 %res, i32 %b
+ ret i32 %sel
+}
+
+define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) {
+; RV32I-M-LABEL: select_example_mul_i64:
+; RV32I-M: # %bb.0: # %entry
+; RV32I-M-NEXT: beqz a4, .LBB1_2
+; RV32I-M-NEXT: # %bb.1:
+; RV32I-M-NEXT: mul a2, a0, a6
+; RV32I-M-NEXT: mulhu a3, a0, a5
+; RV32I-M-NEXT: mul a1, a1, a5
+; RV32I-M-NEXT: add a2, a3, a2
+; RV32I-M-NEXT: add a3, a2, a1
+; RV32I-M-NEXT: mul a2, a0, a5
+; RV32I-M-NEXT: .LBB1_2: # %entry
+; RV32I-M-NEXT: mv a0, a2
+; RV32I-M-NEXT: mv a1, a3
+; RV32I-M-NEXT: ret
+;
+; RV64I-M-LABEL: select_example_mul_i64:
+; RV64I-M: # %bb.0: # %entry
+; RV64I-M-NEXT: beqz a2, .LBB1_2
+; RV64I-M-NEXT: # %bb.1:
+; RV64I-M-NEXT: mul a1, a0, a3
+; RV64I-M-NEXT: .LBB1_2: # %entry
+; RV64I-M-NEXT: mv a0, a1
+; RV64I-M-NEXT: ret
+;
+; RV32I-SFB-M-LABEL: select_example_mul_i64:
+; RV32I-SFB-M: # %bb.0: # %entry
+; RV32I-SFB-M-NEXT: mul a6, a0, a6
+; RV32I-SFB-M-NEXT: mulhu a7, a0, a5
+; RV32I-SFB-M-NEXT: mul a1, a1, a5
+; RV32I-SFB-M-NEXT: mul a0, a0, a5
+; RV32I-SFB-M-NEXT: add a6, a7, a6
+; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2
+; RV32I-SFB-M-NEXT: # %bb.1: # %entry
+; RV32I-SFB-M-NEXT: add a3, a6, a1
+; RV32I-SFB-M-NEXT: .LBB1_2: # %entry
+; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4
+; RV32I-SFB-M-NEXT: # %bb.3: # %entry
+; RV32I-SFB-M-NEXT: mv a0, a2
+; RV32I-SFB-M-NEXT: .LBB1_4: # %entry
+; RV32I-SFB-M-NEXT: mv a1, a3
+; RV32I-SFB-M-NEXT: ret
+;
+; RV64I-SFB-M-LABEL: select_example_mul_i64:
+; RV64I-SFB-M: # %bb.0: # %entry
+; RV64I-SFB-M-NEXT: mul a0, a0, a3
+; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2
+; RV64I-SFB-M-NEXT: # %bb.1: # %entry
+; RV64I-SFB-M-NEXT: mv a0, a1
+; RV64I-SFB-M-NEXT: .LBB1_2: # %entry
+; RV64I-SFB-M-NEXT: ret
+;
+; RV32I-SFBIMul-M-LABEL: select_example_mul_i64:
+; RV32I-SFBIMul-M: # %bb.0: # %entry
+; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6
+; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5
+; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5
+; RV32I-SFBIMul-M-NEXT: add a6, a7, a6
+; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2
+; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry
+; RV32I-SFBIMul-M-NEXT: add a3, a6, a1
+; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry
+; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4
+; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry
+; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5
+; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry
+; RV32I-SFBIMul-M-NEXT: mv a0, a2
+; RV32I-SFBIMul-M-NEXT: mv a1, a3
+; RV32I-SFBIMul-M-NEXT: ret
+;
+; RV64I-SFBIMul-M-LABEL: select_example_mul_i64:
+; RV64I-SFBIMul-M: # %bb.0: # %entry
+; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2
+; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry
+; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3
+; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry
+; RV64I-SFBIMul-M-NEXT: mv a0, a1
+; RV64I-SFBIMul-M-NEXT: ret
+entry:
+ %res = mul i64 %a, %y
+ %sel = select i1 %x, i64 %res, i64 %b
+ ret i64 %sel
+}
+
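
A reading aid for the RUN-line matrix above, assuming the behaviour the checks show: every function in this file reduces to the same select-over-mul shape, and the three configurations differ only in what may sit in the branch shadow. Names in this sketch are illustrative.

; Canonical pattern. Plain +m code branches around the mul;
; +short-forward-branch-opt speculates the mul and branches over the
; register move only; per the checks, +short-forward-branch-i-mul
; additionally allows the (non-widening) mul itself in the one-instruction
; branch shadow, while mulw is still hoisted.
define i32 @sfb_sketch(i32 %a, i32 %b, i1 zeroext %p, i32 %y) {
entry:
  %m = mul i32 %a, %y
  %s = select i1 %p, i32 %m, i32 %b
  ret i32 %s
}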
diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll
index ec4884f..3e0d0cc 100644
--- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll
+++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll
@@ -1,7 +1,9 @@
; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR
+; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR
; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION
-; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer.
+; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer.
; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll
index b1a555a..6b4e35e 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll
@@ -7,6 +7,8 @@
; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS
; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS
+
; CHECK-EXTENSION: OpCapability OptNoneEXT
; CHECK-EXTENSION: OpExtension "SPV_EXT_optnone"
; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll
new file mode 100644
index 0000000..4cabddb
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll
@@ -0,0 +1,142 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic
+
+; CHECK-DAG: OpCapability BFloat16TypeKHR
+; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL
+; CHECK-DAG: OpExtension "SPV_KHR_bfloat16"
+; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic"
+; CHECK-DAG: OpName [[NEG:%.*]] "neg"
+; CHECK-DAG: OpName [[NEGV:%.*]] "negv"
+; CHECK-DAG: OpName [[ADD:%.*]] "add"
+; CHECK-DAG: OpName [[ADDV:%.*]] "addv"
+; CHECK-DAG: OpName [[SUB:%.*]] "sub"
+; CHECK-DAG: OpName [[SUBV:%.*]] "subv"
+; CHECK-DAG: OpName [[MUL:%.*]] "mul"
+; CHECK-DAG: OpName [[MULV:%.*]] "mulv"
+; CHECK-DAG: OpName [[DIV:%.*]] "div"
+; CHECK-DAG: OpName [[DIVV:%.*]] "divv"
+; CHECK-DAG: OpName [[REM:%.*]] "rem"
+; CHECK-DAG: OpName [[REMV:%.*]] "remv"
+; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0
+; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4
+
+; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]]
+define spir_func bfloat @neg(bfloat %x) {
+entry:
+ %r = fneg bfloat %x
+ ret bfloat %r
+}
+
+; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]]
+define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) {
+entry:
+ %r = fneg <4 x bfloat> %x
+ ret <4 x bfloat> %r
+}
+
+; CHECK-DAG: [[ADD]] = OpFunction [[BFLOAT]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]]
+define spir_func bfloat @add(bfloat %x, bfloat %y) {
+entry:
+ %r = fadd bfloat %x, %y
+ ret bfloat %r
+}
+
+; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]]
+define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) {
+entry:
+ %r = fadd <4 x bfloat> %x, %y
+ ret <4 x bfloat> %r
+}
+
+; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]]
+define spir_func bfloat @sub(bfloat %x, bfloat %y) {
+entry:
+ %r = fsub bfloat %x, %y
+ ret bfloat %r
+}
+
+; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]]
+define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) {
+entry:
+ %r = fsub <4 x bfloat> %x, %y
+ ret <4 x bfloat> %r
+}
+
+; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]]
+define spir_func bfloat @mul(bfloat %x, bfloat %y) {
+entry:
+ %r = fmul bfloat %x, %y
+ ret bfloat %r
+}
+
+; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]]
+define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) {
+entry:
+ %r = fmul <4 x bfloat> %x, %y
+ ret <4 x bfloat> %r
+}
+
+; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]]
+define spir_func bfloat @div(bfloat %x, bfloat %y) {
+entry:
+ %r = fdiv bfloat %x, %y
+ ret bfloat %r
+}
+
+; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]]
+define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) {
+entry:
+ %r = fdiv <4 x bfloat> %x, %y
+ ret <4 x bfloat> %r
+}
+
+; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]]
+define spir_func bfloat @rem(bfloat %x, bfloat %y) {
+entry:
+ %r = frem bfloat %x, %y
+ ret bfloat %r
+}
+
+; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]]
+; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]]
+define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) {
+entry:
+ %r = frem <4 x bfloat> %x, %y
+ ret <4 x bfloat> %r
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll
new file mode 100644
index 0000000..3774791
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll
@@ -0,0 +1,376 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic
+
+; CHECK-DAG: OpCapability BFloat16TypeKHR
+; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL
+; CHECK-DAG: OpExtension "SPV_KHR_bfloat16"
+; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic"
+; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq"
+; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq"
+; CHECK-DAG: OpName [[UNE:%.*]] "test_une"
+; CHECK-DAG: OpName [[ONE:%.*]] "test_one"
+; CHECK-DAG: OpName [[ULT:%.*]] "test_ult"
+; CHECK-DAG: OpName [[OLT:%.*]] "test_olt"
+; CHECK-DAG: OpName [[ULE:%.*]] "test_ule"
+; CHECK-DAG: OpName [[OLE:%.*]] "test_ole"
+; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt"
+; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt"
+; CHECK-DAG: OpName [[UGE:%.*]] "test_uge"
+; CHECK-DAG: OpName [[OGE:%.*]] "test_oge"
+; CHECK-DAG: OpName [[UNO:%.*]] "test_uno"
+; CHECK-DAG: OpName [[ORD:%.*]] "test_ord"
+; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq"
+; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq"
+; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une"
+; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one"
+; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult"
+; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt"
+; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule"
+; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole"
+; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt"
+; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt"
+; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge"
+; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge"
+; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno"
+; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord"
+; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0
+; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3
+
+; CHECK: [[UEQ]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ueq(bfloat %a, bfloat %b) {
+ %r = fcmp ueq bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[OEQ]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_oeq(bfloat %a, bfloat %b) {
+ %r = fcmp oeq bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[UNE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_une(bfloat %a, bfloat %b) {
+ %r = fcmp une bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[ONE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_one(bfloat %a, bfloat %b) {
+ %r = fcmp one bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[ULT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ult(bfloat %a, bfloat %b) {
+ %r = fcmp ult bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[OLT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_olt(bfloat %a, bfloat %b) {
+ %r = fcmp olt bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[ULE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ule(bfloat %a, bfloat %b) {
+ %r = fcmp ule bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[OLE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ole(bfloat %a, bfloat %b) {
+ %r = fcmp ole bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[UGT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ugt(bfloat %a, bfloat %b) {
+ %r = fcmp ugt bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[OGT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ogt(bfloat %a, bfloat %b) {
+ %r = fcmp ogt bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[UGE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_uge(bfloat %a, bfloat %b) {
+ %r = fcmp uge bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[OGE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_oge(bfloat %a, bfloat %b) {
+ %r = fcmp oge bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[ORD]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_ord(bfloat %a, bfloat %b) {
+ %r = fcmp ord bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[UNO]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define i1 @test_uno(bfloat %a, bfloat %b) {
+ %r = fcmp uno bfloat %a, %b
+ ret i1 %r
+}
+
+; CHECK: [[v3UEQ]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ueq <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3OEQ]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp oeq <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3UNE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp une <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3ONE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp one <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3ULT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ult <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3OLT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp olt <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3ULE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ule <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3OLE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ole <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3UGT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ugt <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3OGT]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ogt <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3UGE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp uge <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3OGE]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp oge <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3ORD]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp ord <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
+
+; CHECK: [[v3UNO]] = OpFunction
+; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]]
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) {
+ %r = fcmp uno <3 x bfloat> %a, %b
+ ret <3 x i1> %r
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll
new file mode 100644
index 0000000..717771c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll
@@ -0,0 +1,32 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
+; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %}
+; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpCapability KernelAttributesINTEL
+; CHECK: OpExtension "SPV_INTEL_kernel_attributes"
+; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1"
+; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2"
+; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3"
+; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1
+; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1
+; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4
+; CHECK: %[[DIM1]] = OpFunction
+; CHECK: %[[DIM2]] = OpFunction
+; CHECK: %[[DIM3]] = OpFunction
+
+define spir_kernel void @Dim1() !max_work_group_size !0 {
+ ret void
+}
+
+define spir_kernel void @Dim2() !max_work_group_size !1 {
+ ret void
+}
+
+define spir_kernel void @Dim3() !max_work_group_size !2 {
+ ret void
+}
+
+!0 = !{i32 4}
+!1 = !{i32 8, i32 4}
+!2 = !{i32 16, i32 8, i32 4}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll
index f745794..15905dd 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s
define i6 @getConstantI6() {
ret i6 2
diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll
index afffd9e..11e7d00 100644
--- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll
+++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll
@@ -1,4 +1,6 @@
; REQUIRES: spirv-tools
; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s
+; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s
; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}}
+; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}}
diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll
index 686c1e9..49ee993 100644
--- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll
+++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll
@@ -6,6 +6,7 @@
; RUN: llc -O0 -mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14
; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15
; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16
+; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV
; CHECK-SPIRV10: Version: 1.0
; CHECK-SPIRV11: Version: 1.1
@@ -14,3 +15,4 @@
; CHECK-SPIRV14: Version: 1.4
; CHECK-SPIRV15: Version: 1.5
; CHECK-SPIRV16: Version: 1.6
+; AMDGCNSPIRV: Version: 1.6
diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll
new file mode 100644
index 0000000..6a9ce45
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0
+; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32
+; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2
+
+; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]]
+; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]]
+; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0
+; CHECK: OpReturnValue [[UNPACK]]
+define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 {
+ %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0)
+ %3 = extractelement <2 x float> %2, i64 0
+ ret float %3
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
index 678d9a9..ff9b6a3 100644
--- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
+++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
@@ -22,10 +22,10 @@ define void @main(i16 %in) {
; CHECK-NEXT: locghile %r3, 1
; CHECK-NEXT: o %r0, 0(%r1)
; CHECK-NEXT: larl %r1, g_222
-; CHECK-NEXT: lghi %r5, 0
; CHECK-NEXT: dsgfr %r2, %r0
+; CHECK-NEXT: lghi %r3, 0
; CHECK-NEXT: stgrl %r2, g_39
-; CHECK-NEXT: stc %r5, 19(%r1)
+; CHECK-NEXT: stc %r3, 19(%r1)
; CHECK-NEXT: br %r14
%tmp = load i32, ptr @g_151, align 4
%tmp3 = or i32 %tmp, 1
diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
index 9c63819..1cfda8a 100644
--- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
+++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
@@ -10,7 +10,7 @@ declare i32 @has_ptr_arg(ptr)
; CHECK-LABEL: test_invalid_rtn:
; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}}
+; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}}
; CHECK-NEXT: drop $pop[[L1]]{{$}}
; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}}
; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}}
@@ -32,7 +32,7 @@ define void @test_struct_rtn() {
; CHECK-LABEL: test_invalid_arg:
; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}}
-; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}}
+; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}}
; CHECK-NEXT: drop $pop[[L1]]{{$}}
; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}}
; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}}
@@ -54,8 +54,8 @@ entry:
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
-; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2:
-; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32)
+; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1:
+; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32)
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
@@ -64,7 +64,7 @@ entry:
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
-; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4:
-; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32)
+; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2:
+; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32)
; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
index 87059c5..6ae7b22 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
index 5fb2dcd..ca7c357 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) {
; CHECK-LABEL: @test_amx_load_non_O0(
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
index 41e1b5b..5c059a4 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -1,5 +1,6 @@
-# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
-# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86
+# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64
--- |
@@ -30,24 +31,23 @@
...
---
name: test_copy
-# ALL-LABEL: name: test_copy
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0:gr8 = COPY $al
-# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
-# ALL-NEXT: $eax = COPY %1
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al
+ ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s8) = COPY $al
%1(s32) = G_ZEXT %0(s8)
$eax = COPY %1(s32)
@@ -56,24 +56,23 @@ body: |
...
---
name: test_copy2
-# ALL-LABEL: name: test_copy2
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0:gr8 = COPY $al
-# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
-# ALL-NEXT: $eax = COPY %1
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy2
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al
+ ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s8) = COPY $al
%1(s32) = G_ZEXT %0(s8)
$eax = COPY %1(s32)
@@ -82,30 +81,35 @@ body: |
...
---
name: test_copy3
-# ALL-LABEL: name: test_copy3
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] }
-# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] }
-# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr16 = COPY $ax
-# X32-NEXT: %3:gr16_abcd = COPY %0
-# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit
-# X64-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; X86-LABEL: name: test_copy3
+ ; X86: liveins: $eax
+ ; X86-NEXT: {{ $}}
+ ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax
+ ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]]
+ ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit
+ ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]]
+ ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X86-NEXT: RET 0, implicit $eax
+ ;
+ ; X64-LABEL: name: test_copy3
+ ; X64: liveins: $eax
+ ; X64-NEXT: {{ $}}
+ ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax
+ ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]]
+ ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X64-NEXT: RET 0, implicit $eax
%0(s16) = COPY $ax
%1(s8) = G_TRUNC %0(s16)
%2(s32) = G_ZEXT %1(s8)
@@ -115,27 +119,25 @@ body: |
...
---
name: test_copy4
-# ALL-LABEL: name: test_copy4
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $eax
-# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy4
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s32) = COPY $eax
%1(s16) = G_TRUNC %0(s32)
%2(s32) = G_ZEXT %1(s16)
@@ -145,30 +147,35 @@ body: |
...
---
name: test_copy5
-# ALL-LABEL: name: test_copy5
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] }
-# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] }
-# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $edx
-# X32-NEXT: %3:gr32_abcd = COPY %0
-# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit
-# X64-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax,$edx
+ ; X86-LABEL: name: test_copy5
+ ; X86: liveins: $eax, $edx
+ ; X86-NEXT: {{ $}}
+ ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]]
+ ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit
+ ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]]
+ ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X86-NEXT: RET 0, implicit $eax
+ ;
+ ; X64-LABEL: name: test_copy5
+ ; X64: liveins: $eax, $edx
+ ; X64-NEXT: {{ $}}
+ ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]]
+ ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X64-NEXT: RET 0, implicit $eax
%0(s32) = COPY $edx
%1(s8) = G_TRUNC %0(s32)
%2(s32) = G_ANYEXT %1(s8)
@@ -178,29 +185,26 @@ body: |
...
---
name: test_copy6
-# ALL-LABEL: name: test_copy6
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $edx
-# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF
-# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax,$edx
+ ; CHECK-LABEL: name: test_copy6
+ ; CHECK: liveins: $eax, $edx
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit
+ ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s32) = COPY $edx
%1(s16) = G_TRUNC %0(s32)
%2(s32) = G_ANYEXT %1(s16)
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index c311ab8..32d2252 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB9_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: notl %esi
-; X86-NEXT: notl %edx
-; X86-NEXT: je .LBB9_4
-; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB9_4:
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $32, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ebx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: movl %edx, (%ebx)
-; X86-NEXT: movl %esi, 4(%ebx)
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -600,208 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $96, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 64(%esp,%eax), %edx
-; X86-NEXT: movl 68(%esp,%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: movl 72(%esp,%esi), %ebx
-; X86-NEXT: movl 76(%esp,%esi), %esi
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %edi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl 36(%esp,%ecx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esp,%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl 8(%eax), %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: notl %esi
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl 44(%esp,%eax), %eax
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: andl 12(%ecx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl 32(%esp,%eax), %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl (%eax), %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: andl 4(%ecx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl 12(%ebp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: btl %esi, %eax
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %edi, 8(%ecx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %r9d, %r9d
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %r9, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq 8(%rdi), %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq (%rdi), %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: andl $96, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: andl $96, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
-; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: movl %edx, %edx
-; AVX2-NEXT: xorl %r8d, %r8d
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rsi
-; AVX2-NEXT: cmovneq %r9, %rax
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: cmovneq %rdx, %r8
-; AVX2-NEXT: cmovneq %r9, %rdx
-; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: andnq (%rdi), %rax, %r8
-; AVX2-NEXT: orq %rdx, %r8
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $96, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: movl (%rdi,%rax), %eax
-; AVX2-NEXT: btl %ecx, %eax
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: movq %r8, (%rdi)
-; AVX2-NEXT: movq %rsi, 8(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rax, %rsi
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: xorl %r9d, %r9d
-; AVX512-NEXT: shldq %cl, %rdx, %r9
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: cmovneq %r8, %rax
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: cmovneq %rdx, %r9
-; AVX512-NEXT: cmovneq %r8, %rdx
-; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: andnq (%rdi), %rax, %r8
-; AVX512-NEXT: orq %rdx, %r8
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: andl $96, %eax
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: movl (%rdi,%rax), %eax
-; AVX512-NEXT: btl %ecx, %eax
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: movq %rsi, 8(%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andl $96, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
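
The rewritten checks above collapse the wide-integer bit update to a single 32-bit word access. A minimal IR sketch of the scalar form the new btl/btrl/shll sequence computes for the i128 case, under that reading (the i512 variant below differs only in how the word offset is masked; function and value names here are illustrative):

define i1 @init_eq_sketch(ptr %word, i32 %position, i1 zeroext %value) {
  ; Byte offset of the 32-bit word holding the bit: (position & 96) >> 3.
  %wordbits = and i32 %position, 96
  %byteoff = lshr i32 %wordbits, 3
  %p = getelementptr i8, ptr %word, i32 %byteoff
  %w = load i32, ptr %p, align 4
  ; btl: test the bit (setae returns true when it was clear).
  %sh = and i32 %position, 31
  %mask = shl nuw i32 1, %sh
  %old = and i32 %w, %mask
  %wasclear = icmp eq i32 %old, 0
  ; btrl + shll + orl: clear the bit, then merge in the new value.
  %notmask = xor i32 %mask, -1
  %cleared = and i32 %w, %notmask
  %v = zext i1 %value to i32
  %vbit = shl i32 %v, %sh
  %new = or i32 %cleared, %vbit
  store i32 %new, ptr %p, align 4
  ret i1 %wasclear
}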
@@ -977,673 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $352, %esp # imm = 0x160
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %edi
-; X86-NEXT: movl 44(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %eax
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 60(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 56(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 52(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 48(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 40(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 44(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 36(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 40(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 32(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 36(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 28(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 32(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 24(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 28(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 20(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 24(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 20(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 12(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 16(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 8(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 12(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 4(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 8(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: andl 4(%edx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%edx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 60(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 56(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 52(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl %ebx, 8(%edx)
-; X86-NEXT: movl %edi, 4(%edx)
-; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $168, %rsp
-; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: andl $56, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %r12
-; SSE-NEXT: movq 136(%rsp,%r12), %r9
-; SSE-NEXT: movq 144(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %rsi
-; SSE-NEXT: shldq %cl, %r9, %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 152(%rsp,%r12), %r11
-; SSE-NEXT: shldq %cl, %rax, %r11
-; SSE-NEXT: movq 120(%rsp,%r12), %r10
-; SSE-NEXT: movq 128(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %rbx
-; SSE-NEXT: shldq %cl, %r10, %rbx
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: movq 104(%rsp,%r12), %r14
-; SSE-NEXT: movq 112(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %r15
-; SSE-NEXT: shldq %cl, %r14, %r15
-; SSE-NEXT: shldq %cl, %rax, %r10
-; SSE-NEXT: movq 96(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %r13
-; SSE-NEXT: shlq %cl, %r13
-; SSE-NEXT: shldq %cl, %rax, %r14
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq 8(%rsp,%r12), %r8
-; SSE-NEXT: movq 16(%rsp,%r12), %rsi
-; SSE-NEXT: movq %rsi, %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: notq %rax
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: notq %rbx
-; SSE-NEXT: notq %r11
-; SSE-NEXT: movq 24(%rsp,%r12), %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq -8(%rsp,%r12), %rbp
-; SSE-NEXT: movq (%rsp,%r12), %rdx
-; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shldq %cl, %rbp, %rsi
-; SSE-NEXT: andq 56(%rdi), %r11
-; SSE-NEXT: andq 32(%rdi), %rbx
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %rsi, %rbx
-; SSE-NEXT: notq %r15
-; SSE-NEXT: shldq %cl, %rdx, %r8
-; SSE-NEXT: notq %r9
-; SSE-NEXT: andq 40(%rdi), %r9
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: movq -24(%rsp,%r12), %rax
-; SSE-NEXT: movq -16(%rsp,%r12), %rdx
-; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shldq %cl, %rax, %rsi
-; SSE-NEXT: andq 16(%rdi), %r15
-; SSE-NEXT: orq %rsi, %r15
-; SSE-NEXT: shldq %cl, %rdx, %rbp
-; SSE-NEXT: notq %r10
-; SSE-NEXT: notq %r13
-; SSE-NEXT: movq -32(%rsp,%r12), %rdx
-; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: andq 24(%rdi), %r10
-; SSE-NEXT: andq (%rdi), %r13
-; SSE-NEXT: orq %rbp, %r10
-; SSE-NEXT: orq %rsi, %r13
-; SSE-NEXT: notq %r14
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: andq 8(%rdi), %r14
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andl $60, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; SSE-NEXT: btl %ecx, %eax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq %rax, 48(%rdi)
-; SSE-NEXT: movq %r11, 56(%rdi)
-; SSE-NEXT: movq %rbx, 32(%rdi)
-; SSE-NEXT: movq %r9, 40(%rdi)
-; SSE-NEXT: movq %r15, 16(%rdi)
-; SSE-NEXT: movq %r10, 24(%rdi)
-; SSE-NEXT: movq %r13, (%rdi)
-; SSE-NEXT: movq %r14, 8(%rdi)
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $60, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
-; SSE-NEXT: addq $168, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $184, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: shrl $3, %ebx
-; AVX2-NEXT: movl %ebx, %eax
-; AVX2-NEXT: andl $56, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %r11
-; AVX2-NEXT: movq 128(%rsp,%r11), %r15
-; AVX2-NEXT: movq 136(%rsp,%r11), %rax
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: shldq %cl, %r15, %rsi
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 120(%rsp,%r11), %r8
-; AVX2-NEXT: shldq %cl, %r8, %r15
-; AVX2-NEXT: movq 144(%rsp,%r11), %r14
-; AVX2-NEXT: movq 152(%rsp,%r11), %rsi
-; AVX2-NEXT: movq %rsi, %r9
-; AVX2-NEXT: shldq %cl, %r14, %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %r14
-; AVX2-NEXT: movq 112(%rsp,%r11), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 160(%rsp,%r11), %r13
-; AVX2-NEXT: movq 168(%rsp,%r11), %r12
-; AVX2-NEXT: shldq %cl, %r13, %r12
-; AVX2-NEXT: shldq %cl, %rsi, %r13
-; AVX2-NEXT: shldq %cl, %rax, %r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq 24(%rsp,%r11), %rbp
-; AVX2-NEXT: movq 32(%rsp,%r11), %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq 40(%rsp,%r11), %r10
-; AVX2-NEXT: shldq %cl, %rdx, %r10
-; AVX2-NEXT: movq 8(%rsp,%r11), %r9
-; AVX2-NEXT: movq 16(%rsp,%r11), %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: shldq %cl, %r9, %r8
-; AVX2-NEXT: shldq %cl, %rdx, %rbp
-; AVX2-NEXT: andnq 48(%rdi), %r13, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq -8(%rsp,%r11), %rax
-; AVX2-NEXT: movq (%rsp,%r11), %rdx
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: andnq 56(%rdi), %r12, %r12
-; AVX2-NEXT: andnq 32(%rdi), %r14, %r14
-; AVX2-NEXT: orq %r10, %r12
-; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: movq -16(%rsp,%r11), %r10
-; AVX2-NEXT: shlxq %rcx, %r10, %r11
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %r10, %rax
-; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andnq 24(%rdi), %r10, %r10
-; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: orq %r9, %r10
-; AVX2-NEXT: andnq (%rdi), %r8, %rsi
-; AVX2-NEXT: orq %r11, %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
-; AVX2-NEXT: orq %rax, %r8
-; AVX2-NEXT: andl $60, %ebx
-; AVX2-NEXT: movl (%rdi,%rbx), %eax
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; AVX2-NEXT: btl %r9d, %eax
-; AVX2-NEXT: movq %r13, 48(%rdi)
-; AVX2-NEXT: movq %r12, 56(%rdi)
-; AVX2-NEXT: movq %r14, 32(%rdi)
-; AVX2-NEXT: movq %rdx, 40(%rdi)
-; AVX2-NEXT: movq %rcx, 16(%rdi)
-; AVX2-NEXT: movq %r10, 24(%rdi)
-; AVX2-NEXT: movq %rsi, (%rdi)
-; AVX2-NEXT: movq %r8, 8(%rdi)
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: addq $184, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $168, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: movl %esi, %r10d
-; AVX512-NEXT: shrl $3, %r10d
-; AVX512-NEXT: movl %r10d, %r8d
-; AVX512-NEXT: andl $56, %r8d
-; AVX512-NEXT: negl %r8d
-; AVX512-NEXT: movslq %r8d, %r9
-; AVX512-NEXT: movq 112(%rsp,%r9), %r11
-; AVX512-NEXT: movq 120(%rsp,%r9), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %r11, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 104(%rsp,%r9), %rax
-; AVX512-NEXT: shldq %cl, %rax, %r11
-; AVX512-NEXT: movq 128(%rsp,%r9), %r15
-; AVX512-NEXT: movq 136(%rsp,%r9), %rbp
-; AVX512-NEXT: movq %rbp, %rbx
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: shldq %cl, %r14, %r15
-; AVX512-NEXT: movq 144(%rsp,%r9), %r13
-; AVX512-NEXT: movq 152(%rsp,%r9), %r12
-; AVX512-NEXT: shldq %cl, %r13, %r12
-; AVX512-NEXT: movq 96(%rsp,%r9), %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq 8(%rsp,%r9), %r8
-; AVX512-NEXT: movq 16(%rsp,%r9), %rax
-; AVX512-NEXT: movq %rax, %rbp
-; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: andnq 48(%rdi), %r13, %r13
-; AVX512-NEXT: orq %rbp, %r13
-; AVX512-NEXT: movq 24(%rsp,%r9), %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq -8(%rsp,%r9), %rax
-; AVX512-NEXT: movq (%rsp,%r9), %rsi
-; AVX512-NEXT: movq %rsi, %rdx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: andnq 56(%rdi), %r12, %r12
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: andnq 32(%rdi), %r15, %r15
-; AVX512-NEXT: orq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: movq -24(%rsp,%r9), %rdx
-; AVX512-NEXT: movq -16(%rsp,%r9), %rsi
-; AVX512-NEXT: movq %rsi, %rbp
-; AVX512-NEXT: shldq %cl, %rdx, %rbp
-; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: andnq 16(%rdi), %r11, %r8
-; AVX512-NEXT: orq %rbp, %r8
-; AVX512-NEXT: shlxq %rcx, %r14, %r11
-; AVX512-NEXT: movq -32(%rsp,%r9), %r9
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %rax, %rsi
-; AVX512-NEXT: shlxq %rcx, %r9, %rax
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: andnq (%rdi), %r11, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andnq 8(%rdi), %rax, %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: andl $60, %r10d
-; AVX512-NEXT: movl (%rdi,%r10), %edx
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; AVX512-NEXT: btl %r9d, %edx
-; AVX512-NEXT: movq %r13, 48(%rdi)
-; AVX512-NEXT: movq %r12, 56(%rdi)
-; AVX512-NEXT: movq %r15, 32(%rdi)
-; AVX512-NEXT: movq %rbx, 40(%rdi)
-; AVX512-NEXT: movq %r8, 16(%rdi)
-; AVX512-NEXT: movq %rsi, 24(%rdi)
-; AVX512-NEXT: movq %rcx, (%rdi)
-; AVX512-NEXT: movq %rax, 8(%rdi)
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: addq $168, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i512:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $60, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
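
The same narrowing fires at i512; only the dword-address math differs in form, with the range mask applied on opposite sides of the shift. Both are instances of one formula (a sketch; `bits` must be a power of two):

    #include <stddef.h>

    // i128: (pos & 96) >> 3  ==  4 * ((pos & 127) >> 5)
    // i512: (pos >> 3) & 60  ==  4 * ((pos & 511) >> 5)
    static size_t dword_byte_offset(unsigned pos, unsigned bits) {
        return 4u * ((pos & (bits - 1)) >> 5);
    }
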
@@ -1698,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_cmpz_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %esi
-; X86-NEXT: movl 36(%esp,%esi), %eax
-; X86-NEXT: movl 40(%esp,%esi), %edi
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 32(%esp,%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%esi), %esi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: xorl 12(%ecx), %esi
-; X86-NEXT: xorl 8(%ecx), %edx
-; X86-NEXT: xorl 4(%ecx), %eax
-; X86-NEXT: xorl (%ecx), %edi
-; X86-NEXT: movl %edx, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %edi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: andl $96, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: xorl %edx, (%eax,%ecx)
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl 4(%eax), %edx
+; X86-NEXT: orl 12(%eax), %edx
+; X86-NEXT: orl 8(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: complement_cmpz_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %rsi, %rax
-; SSE-NEXT: xorq 8(%rdi), %rdx
-; SSE-NEXT: xorq (%rdi), %rax
-; SSE-NEXT: movq %rax, (%rdi)
-; SSE-NEXT: movq %rdx, 8(%rdi)
-; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: shll %cl, %eax
+; SSE-NEXT: andl $96, %ecx
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: xorl %eax, (%rdi,%rcx)
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: orq 8(%rdi), %rax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
-; AVX2-LABEL: complement_cmpz_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rdx
-; AVX2-NEXT: cmovneq %rsi, %rax
-; AVX2-NEXT: xorq 8(%rdi), %rdx
-; AVX2-NEXT: xorq (%rdi), %rax
-; AVX2-NEXT: movq %rax, (%rdi)
-; AVX2-NEXT: movq %rdx, 8(%rdi)
-; AVX2-NEXT: orq %rdx, %rax
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: complement_cmpz_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movl $1, %edx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rdx, %rsi
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rdx, %rsi
-; AVX512-NEXT: cmovneq %rax, %rdx
-; AVX512-NEXT: xorq 8(%rdi), %rsi
-; AVX512-NEXT: xorq (%rdi), %rdx
-; AVX512-NEXT: movq %rdx, (%rdi)
-; AVX512-NEXT: movq %rsi, 8(%rdi)
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: retq
+; AVX-LABEL: complement_cmpz_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX-NEXT: movl $1, %eax
+; AVX-NEXT: shlxl %esi, %eax, %eax
+; AVX-NEXT: andl $96, %esi
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: xorl %eax, (%rdi,%rsi)
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: orq 8(%rdi), %rax
+; AVX-NEXT: setne %al
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
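
Here the complement is likewise narrowed to one memory XOR, while the zero-compare still reads the whole value. A C model of the new codegen (the array-of-dwords view is illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    static bool complement_cmpz_u128(uint32_t w[4], unsigned pos) {
        w[(pos & 127) >> 5] ^= 1u << (pos & 31);  // xorl into one dword
        return (w[0] | w[1] | w[2] | w[3]) != 0;  // orl/orq chain + setne
    }
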
@@ -1821,14 +960,152 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-LABEL: reset_multiload_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: btrl %edx, %ebx
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl %ebx, (%ecx,%esi)
+; X86-NEXT: jae .LBB22_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB22_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: reset_multiload_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %r9d
+; X64-NEXT: movl %r9d, %r8d
+; X64-NEXT: btrl %esi, %r8d
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl %esi, %r9d
+; X64-NEXT: jb .LBB22_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: .LBB22_2:
+; X64-NEXT: movl %r8d, (%rdi,%rcx)
+; X64-NEXT: retq
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %mask = xor i128 %bit, -1
+ %ld = load i128, ptr %word
+ %sel = load i32, ptr %p
+ %test = and i128 %ld, %bit
+ %res = and i128 %ld, %mask
+ %cmp = icmp eq i128 %test, 0
+ store i128 %res, ptr %word
+ %ret = select i1 %cmp, i32 %sel, i32 0
+ ret i32 %ret
+}
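
reset_multiload shows the narrowing interacting with a second load: the bit is cleared in one dword, and the select keeps *p only when the old bit was clear. A C sketch under the same illustrative dword-array view:

    #include <stdint.h>

    static int reset_multiload_u128(uint32_t w[4], unsigned pos,
                                    const int *p) {
        unsigned idx = (pos & 127) >> 5;
        uint32_t old = w[idx];
        w[idx] = old & ~(1u << (pos & 31));       // btrl + movl store
        return (old >> (pos & 31)) & 1 ? 0 : *p;  // btl + jb/jae
    }
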
+
+; Multiple uses of both the store chain and the stored value
+define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind {
+; X86-LABEL: chain_reset_i256:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-2, %edi
+; X86-NEXT: roll %cl, %edi
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $28, %ecx
+; X86-NEXT: andl %edi, (%esi,%ecx)
+; X86-NEXT: movl 8(%esi), %ebx
+; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl 12(%esi), %ebp
+; X86-NEXT: orl 28(%esi), %ebp
+; X86-NEXT: orl 20(%esi), %ecx
+; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl 24(%esi), %ebx
+; X86-NEXT: movl 16(%esi), %ebp
+; X86-NEXT: orl %edi, %ebp
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl %edi, (%edx)
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: orl %ecx, %ebp
+; X86-NEXT: jne .LBB23_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: .LBB23_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: chain_reset_i256:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: movl $-2, %eax
+; X64-NEXT: roll %cl, %eax
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $28, %ecx
+; X64-NEXT: andl %eax, (%rdi,%rcx)
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq 8(%rdi), %r8
+; X64-NEXT: orq 24(%rdi), %r8
+; X64-NEXT: movq 16(%rdi), %rdi
+; X64-NEXT: orq %rcx, %rdi
+; X64-NEXT: movl (%rsi), %eax
+; X64-NEXT: movl %ecx, (%rsi)
+; X64-NEXT: movl (%rdx), %ecx
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: orq %r8, %rdi
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+ %rem = and i32 %position, 255
+ %ofs = zext nneg i32 %rem to i256
+ %bit = shl nuw i256 1, %ofs
+ %ld0 = load i256, ptr %p0
+ %msk = xor i256 %bit, -1
+ %res = and i256 %ld0, %msk
+ store i256 %res, ptr %p0
+ %cmp = icmp ne i256 %res, 0
+ %ld1 = load i32, ptr %p1
+ %trunc = trunc i256 %res to i32
+ store i32 %trunc, ptr %p1
+ %ld2 = load i32, ptr %p2
+ %add = add i32 %ld1, %ld2
+ %sel = select i1 %cmp, i32 %ld2, i32 %add
+ ret i32 %sel
+}
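
The `movl $-2; roll %cl` pair in both outputs builds the clear-mask directly from the rotate identity rotl32(~1, pos) == ~(1 << (pos & 31)). A sketch of the masked dword update (function names are illustrative):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned r) {
        r &= 31;
        return (x << r) | (x >> ((32 - r) & 31));
    }

    static void chain_reset_word(uint32_t w[8], unsigned pos) {
        w[(pos & 255) >> 5] &= rotl32(~1u, pos);  // roll %cl + andl (mem)
    }
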
+
+; BTC/BT/BTS sequence on the same i128
+define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
+; X86-LABEL: sequence_i128:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: subl $144, %esp
+; X86-NEXT: movb 20(%ebp), %ch
+; X86-NEXT: movb 12(%ebp), %cl
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1842,36 +1119,80 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 40(%esp,%eax), %edx
-; X86-NEXT: movl 44(%esp,%eax), %esi
+; X86-NEXT: movl 56(%esp,%eax), %edx
+; X86-NEXT: movl 60(%esp,%eax), %esi
; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%eax), %edi
-; X86-NEXT: movl 36(%esp,%eax), %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
-; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movb %ch, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 84(%esp,%eax), %edx
+; X86-NEXT: movl 88(%esp,%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl 20(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esp,%eax), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl %ebx, 4(%eax)
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: notl %edi
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl $96, %ebx
-; X86-NEXT: shrl $3, %ebx
-; X86-NEXT: movl (%eax,%ebx), %ebx
-; X86-NEXT: andl %edi, (%eax)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 12(%eax)
-; X86-NEXT: notl %edx
-; X86-NEXT: andl %edx, 8(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: btl %ecx, %ebx
-; X86-NEXT: jae .LBB22_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB22_2:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl 8(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl 12(%eax), %esi
+; X86-NEXT: xorl (%eax), %edi
+; X86-NEXT: xorl 4(%eax), %ebx
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: andb $96, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 96(%esp,%eax), %eax
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl %edx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %edi, (%ecx)
+; X86-NEXT: movl %ebx, 4(%ecx)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1879,96 +1200,129 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_multiload_i128:
+; SSE-LABEL: sequence_i128:
; SSE: # %bb.0:
+; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: movl $1, %r8d
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %r8, %rsi
+; SSE-NEXT: movl $1, %r9d
+; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: xorl %r11d, %r11d
; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %rax, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
-; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movl (%rdx), %eax
-; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %rsi, (%rdi)
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: cmovneq %r11, %r9
+; SSE-NEXT: xorl %r10d, %r10d
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shldq %cl, %r8, %r10
+; SSE-NEXT: shlq %cl, %r8
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: cmovneq %r8, %r10
+; SSE-NEXT: cmovneq %r11, %r8
+; SSE-NEXT: xorq 8(%rdi), %rsi
+; SSE-NEXT: xorq (%rdi), %r9
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: andb $32, %cl
+; SSE-NEXT: movq %r9, %rax
+; SSE-NEXT: shrdq %cl, %rsi, %rax
+; SSE-NEXT: movq %rsi, %r11
+; SSE-NEXT: shrq %cl, %r11
+; SSE-NEXT: testb $64, %dl
+; SSE-NEXT: cmoveq %rax, %r11
+; SSE-NEXT: btl %edx, %r11d
+; SSE-NEXT: setae %al
+; SSE-NEXT: orq %r10, %rsi
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: movq %r9, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: reset_multiload_i128:
+; AVX2-LABEL: sequence_i128:
; AVX2: # %bb.0:
+; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: movl $1, %r10d
; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
+; AVX2-NEXT: shldq %cl, %r10, %rsi
+; AVX2-NEXT: shlxq %rcx, %r10, %r8
; AVX2-NEXT: testb $64, %cl
; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: cmovneq %r9, %r8
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shldq %cl, %r10, %r11
+; AVX2-NEXT: shlxq %rax, %r10, %r10
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: cmovneq %r10, %r11
+; AVX2-NEXT: cmovneq %r9, %r10
+; AVX2-NEXT: xorq 8(%rdi), %rsi
+; AVX2-NEXT: xorq (%rdi), %r8
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: andb $32, %cl
+; AVX2-NEXT: movq %r8, %rax
+; AVX2-NEXT: shrdq %cl, %rsi, %rax
+; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT: testb $64, %dl
+; AVX2-NEXT: cmoveq %rax, %rcx
+; AVX2-NEXT: btl %edx, %ecx
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: orq %r11, %rsi
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: reset_multiload_i128:
+; AVX512-LABEL: sequence_i128:
; AVX512: # %bb.0:
+; AVX512-NEXT: movl %ecx, %eax
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
+; AVX512-NEXT: movl $1, %r9d
; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
+; AVX512-NEXT: shldq %cl, %r9, %rsi
+; AVX512-NEXT: xorl %r10d, %r10d
+; AVX512-NEXT: shlxq %rcx, %r9, %r8
; AVX512-NEXT: testb $64, %cl
; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: cmovneq %r10, %r8
+; AVX512-NEXT: xorl %r11d, %r11d
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shldq %cl, %r9, %r11
+; AVX512-NEXT: shlxq %rax, %r9, %r9
+; AVX512-NEXT: testb $64, %al
+; AVX512-NEXT: cmovneq %r9, %r11
+; AVX512-NEXT: cmovneq %r10, %r9
+; AVX512-NEXT: xorq 8(%rdi), %rsi
+; AVX512-NEXT: xorq (%rdi), %r8
+; AVX512-NEXT: movl %edx, %ecx
+; AVX512-NEXT: andb $32, %cl
+; AVX512-NEXT: movq %r8, %rax
+; AVX512-NEXT: shrdq %cl, %rsi, %rax
+; AVX512-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX512-NEXT: testb $64, %dl
+; AVX512-NEXT: cmoveq %rax, %rcx
+; AVX512-NEXT: btl %edx, %ecx
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: orq %r11, %rsi
+; AVX512-NEXT: orq %r9, %r8
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: retq
- %rem = and i32 %position, 127
- %ofs = zext nneg i32 %rem to i128
- %bit = shl nuw i128 1, %ofs
- %mask = xor i128 %bit, -1
+ %rem0 = and i32 %pos0, 127
+ %rem1 = and i32 %pos1, 127
+ %rem2 = and i32 %pos2, 127
+ %ofs0 = zext nneg i32 %rem0 to i128
+ %ofs1 = zext nneg i32 %rem1 to i128
+ %ofs2 = zext nneg i32 %rem2 to i128
+ %bit0 = shl nuw i128 1, %ofs0
+ %bit1 = shl nuw i128 1, %ofs1
+ %bit2 = shl nuw i128 1, %ofs2
%ld = load i128, ptr %word
- %sel = load i32, ptr %p
- %test = and i128 %ld, %bit
- %res = and i128 %ld, %mask
- %cmp = icmp eq i128 %test, 0
- store i128 %res, ptr %word
- %ret = select i1 %cmp, i32 %sel, i32 0
- ret i32 %ret
+ %res0 = xor i128 %ld, %bit0
+ %test1 = and i128 %res0, %bit1
+ %cmp1 = icmp eq i128 %test1, 0
+ %res2 = or i128 %res0, %bit2
+ store i128 %res2, ptr %word
+ ret i1 %cmp1
}
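
sequence_i128 is the counterexample: three independent bit positions touch the same i128, so the value stays live in registers and the full shldq / testb $64 / cmovneq expansions survive. A scalar model of the semantics, assuming the Clang/GCC unsigned __int128 extension:

    #include <stdbool.h>

    static bool sequence_u128(unsigned __int128 *word, unsigned p0,
                              unsigned p1, unsigned p2) {
        unsigned __int128 one = 1;
        unsigned __int128 v = *word ^ (one << (p0 & 127));  // btc
        bool clear = (v & (one << (p1 & 127))) == 0;        // bt
        *word = v | (one << (p2 & 127));                    // bts
        return clear;
    }
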
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index f36baba..ab8498d 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
}
; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
entry:
%sink = alloca i32, align 4
@@ -33,6 +32,6 @@ entry:
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
; CHECK-NEXT: .byte 1
;; Function Entry PC
-; CHECK-NEXT: .quad [[LABEL_FUNC]]
+; CHECK-NEXT: .quad _ZL10myCallbacki
;; Function type ID
; CHECK-NEXT: .quad -5212364466660467813
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index cdbad66..02d7107 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
declare !type !2 ptr @direct_baz(ptr)
; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define ptr @ball() {
entry:
call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
;; Flags
; CHECK-NEXT: .byte 7
;; Function Entry PC
-; CHECK-NEXT: .quad [[LABEL_FUNC]]
+; CHECK-NEXT: .quad ball
;; Function type ID -- set to 0 as no type metadata attached to function.
; CHECK-NEXT: .quad 0
;; Number of unique direct callees.
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index feac3dc..30f1874 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
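
The AVX512BW v64i8 shifts are now lowered through 16-bit variable shifts instead of the vpsllw $5 / vpmovb2m blend ladder. Per word, the even (low) byte is shifted by its own amount, the odd (high) byte is shifted with the low byte masked off so nothing carries across, and a 0x5555... byte mask blends the halves. A scalar model of one word (the broadcast constants' values are inferred, since the checks elide them):

    #include <stdint.h>

    static uint16_t shl_byte_pair(uint16_t a, uint16_t b) {
        unsigned s0 = b & 0xFF, s1 = b >> 8;                   // vpandd / vpsrlw $8
        uint16_t lo = s0 < 16 ? (uint16_t)(a << s0) : 0;       // vpsllvw
        uint16_t hi = s1 < 16 ? (uint16_t)((a & 0xFF00u) << s1) : 0;
        return (uint16_t)((hi & 0xFF00u) | (lo & 0x00FFu));    // vmovdqu8 {%k1}
    }

The var_lshr_v64i8 hunk below uses the same split with vpsrlvw, masking both operand and amounts to the low bytes for the even-byte half.
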
@@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_lshr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_ashr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; GFNIAVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
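
The ashr case needs sign handling: the word's sign bit is already the high byte's sign, so the high byte shifts directly with vpsravw, while the low byte is first sign-extended to 16 bits via vpsllw $8 + vpsraw $8. A scalar model of one word (arithmetic >> on negative values is assumed to sign-fill, as on mainstream compilers):

    #include <stdint.h>

    static uint16_t ashr_byte_pair(uint16_t a, uint16_t b) {
        unsigned s1 = b >> 8, s0 = b & 0xFF;
        int16_t hi = (int16_t)((int16_t)a >> (s1 < 15 ? s1 : 15)); // vpsravw
        int16_t lo = (int16_t)((int16_t)(a << 8) >> 8);            // vpsllw/vpsraw $8
        lo = (int16_t)(lo >> (s0 < 15 ? s0 : 15));                 // vpsravw
        return (uint16_t)((hi & 0xFF00) | (lo & 0x00FF));          // vmovdqu8 {%k1}
    }
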
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
new file mode 100644
index 0000000..aef44cc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
+; SSE2-LABEL: pr166534:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 8(%rdi), %r8
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movq (%rsi), %r9
+; SSE2-NEXT: movq 8(%rsi), %rdi
+; SSE2-NEXT: movdqu (%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %esi
+; SSE2-NEXT: xorl %r10d, %r10d
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: sete %r10b
+; SSE2-NEXT: orq %r10, (%rdx)
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: jne .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: xorq %r9, %rax
+; SSE2-NEXT: xorq %rdi, %r8
+; SSE2-NEXT: xorl %edx, %edx
+; SSE2-NEXT: orq %rax, %r8
+; SSE2-NEXT: sete %dl
+; SSE2-NEXT: orq %rdx, (%rcx)
+; SSE2-NEXT: .LBB0_2: # %if.end
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: pr166534:
+; SSE4: # %bb.0: # %entry
+; SSE4-NEXT: movq (%rdi), %rax
+; SSE4-NEXT: movq 8(%rdi), %r8
+; SSE4-NEXT: movdqu (%rdi), %xmm0
+; SSE4-NEXT: movq (%rsi), %r9
+; SSE4-NEXT: movq 8(%rsi), %rdi
+; SSE4-NEXT: movdqu (%rsi), %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: xorl %esi, %esi
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: sete %sil
+; SSE4-NEXT: orq %rsi, (%rdx)
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: jne .LBB0_2
+; SSE4-NEXT: # %bb.1: # %if.then
+; SSE4-NEXT: xorq %r9, %rax
+; SSE4-NEXT: xorq %rdi, %r8
+; SSE4-NEXT: xorl %edx, %edx
+; SSE4-NEXT: orq %rax, %r8
+; SSE4-NEXT: sete %dl
+; SSE4-NEXT: orq %rdx, (%rcx)
+; SSE4-NEXT: .LBB0_2: # %if.end
+; SSE4-NEXT: retq
+;
+; AVX2-LABEL: pr166534:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: movq 8(%rdi), %r8
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: movq (%rsi), %rdi
+; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: movq 8(%rsi), %rsi
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: sete %r9b
+; AVX2-NEXT: orq %r9, (%rdx)
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: jne .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: xorq %rdi, %rax
+; AVX2-NEXT: xorq %rsi, %r8
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: sete %dl
+; AVX2-NEXT: orq %rdx, (%rcx)
+; AVX2-NEXT: .LBB0_2: # %if.end
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pr166534:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: movq 8(%rdi), %r8
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: movq (%rsi), %r9
+; AVX512-NEXT: movq 8(%rsi), %rdi
+; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %sil
+; AVX512-NEXT: orq %rsi, (%rdx)
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: jne .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: xorq %r9, %rax
+; AVX512-NEXT: xorq %rdi, %r8
+; AVX512-NEXT: xorl %edx, %edx
+; AVX512-NEXT: orq %rax, %r8
+; AVX512-NEXT: sete %dl
+; AVX512-NEXT: orq %rdx, (%rcx)
+; AVX512-NEXT: .LBB0_2: # %if.end
+; AVX512-NEXT: retq
+entry:
+ %a = load i128, ptr %pa, align 8
+ %b = load i128, ptr %pb, align 8
+ %cmp = icmp eq i128 %a, %b
+ %conv1 = zext i1 %cmp to i128
+ %c = load i128, ptr %pc, align 8
+ %or = or i128 %c, %conv1
+ store i128 %or, ptr %pc, align 8
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
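+ ; reached only when %a == %b, so %conv1 is known to be 1 here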
+ %d = load i128, ptr %pd, align 8
+ %or7 = or i128 %d, %conv1
+ store i128 %or7, ptr %pd, align 8
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420..aff2228 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d570..4450d07 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd7429..41238ac 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/Xtensa/s32c1i.ll b/llvm/test/CodeGen/Xtensa/s32c1i.ll
new file mode 100644
index 0000000..aad738a
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/s32c1i.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=xtensa -mattr=+s32c1i -filetype=obj %s -o - | llvm-objdump --arch=xtensa --mattr=s32c1i -d - | FileCheck %s -check-prefix=XTENSA
+
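+; Check that the 'i' constraint feeds the immediate offset operand of the
+; s32c1i instruction (offset 4 below) instead of forcing it into a register.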
+define i32 @constraint_i(i32 %a) {
+; XTENSA: 0: 22 e2 01 s32c1i a2, a2, 4
+ %res = tail call i32 asm "s32c1i $0, $1, $2", "=r,r,i"(i32 %a, i32 4)
+ ret i32 %res
+}