aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll199
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll325
-rw-r--r--llvm/test/CodeGen/AMDGPU/readcyclecounter.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll236
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir142
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-xcnt.mir176
-rw-r--r--llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll36
10 files changed, 924 insertions, 218 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
index 1a7ccf0..588802c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
; GFX7-LABEL: fcmp_uniform_select:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
index 67cc016..b6652f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s
---
name: test_copy_scc_vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 02d0e52..6facdfd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
ret <4 x i32> %res
}
-define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+define i16 @abs_vgpr_i16(i16 %arg) {
; GFX6-LABEL: abs_vgpr_i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX10-NEXT: v_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
-define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+define i32 @abs_vgpr_i32(i32 %arg) {
; GFX6-LABEL: abs_vgpr_i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
-define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+define i64 @abs_vgpr_i64(i64 %arg) {
; GFX6-LABEL: abs_vgpr_i64:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
@@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
-define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-LABEL: abs_vgpr_v4i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
@@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
@@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v4i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
@@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
@@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
ret <2 x i8> %res
}
-define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v2i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
@@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
ret <3 x i8> %res
}
-define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v3i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v3
; GFX10-NEXT: v_max_i16 v1, v1, v4
; GFX10-NEXT: v_max_i16 v2, v2, v5
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_max_i16 v1, v1, v4
; GFX1250-NEXT: v_max_i16 v2, v2, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
ret <2 x i16> %res
}
-define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v2i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
ret <3 x i16> %res
}
-define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v3i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
new file mode 100644
index 0000000..05a0e39
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 {
+; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users(
+; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison
+; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0
+; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1
+; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2
+; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3
+; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4
+; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5
+; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6
+; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7
+; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8
+; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9
+; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10
+; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11
+; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12
+; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13
+; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14
+; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15
+; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0
+; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1
+; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2
+; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3
+; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4
+; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5
+; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6
+; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7
+; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8
+; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9
+; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10
+; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11
+; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12
+; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13
+; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14
+; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15
+; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0
+; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1
+; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2
+; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3
+; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4
+; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5
+; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6
+; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7
+; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8
+; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9
+; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10
+; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11
+; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12
+; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13
+; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14
+; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15
+; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0
+; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1
+; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2
+; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3
+; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4
+; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5
+; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6
+; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7
+; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8
+; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9
+; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10
+; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11
+; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12
+; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13
+; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14
+; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15
+; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0
+; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1
+; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2
+; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3
+; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4
+; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5
+; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6
+; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7
+; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8
+; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9
+; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10
+; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11
+; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12
+; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13
+; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14
+; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15
+; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0
+; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1
+; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2
+; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3
+; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4
+; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5
+; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6
+; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7
+; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8
+; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9
+; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10
+; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11
+; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12
+; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13
+; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14
+; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15
+; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0
+; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1
+; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2
+; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3
+; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4
+; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5
+; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6
+; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7
+; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8
+; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9
+; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10
+; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11
+; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12
+; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13
+; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14
+; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15
+; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0
+; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1
+; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2
+; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3
+; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4
+; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5
+; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6
+; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7
+; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8
+; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9
+; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10
+; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11
+; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12
+; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13
+; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14
+; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15
+; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80
+; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0
+; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81
+; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1
+; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82
+; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2
+; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83
+; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3
+; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84
+; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4
+; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85
+; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5
+; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86
+; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6
+; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87
+; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7
+; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88
+; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8
+; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89
+; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9
+; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90
+; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10
+; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91
+; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11
+; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92
+; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12
+; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93
+; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13
+; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94
+; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14
+; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95
+; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15
+; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]]
+; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16
+; OPT-NEXT: ret void
+;
+entry:
+ %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5)
+ %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16
+ %sum = add <16 x i8> %load, %add
+ store <16 x i8> %sum, ptr addrspace(3) %out, align 16
+ ret void
+}
+
+attributes #0 = {"amdgpu-waves-per-eu"="2,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index f67cbe3..ddb522a8 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -1,17 +1,17 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; -global-isel=1 SI run line skipped since store not yet implemented.
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
declare i64 @llvm.readcyclecounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000..22e4a24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+
+define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u16_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u16_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u16 v0.h, v0.l, v1.l
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u16 v0.l, v0.h, v0.l
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i16 %a, %b
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i32 %a, %b
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i32 %a, %b
+ %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+ ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT: global_store_b32 v[2:3], v1, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i32 %a, %b
+ store i32 %sub, ptr addrspace(1) %ptr
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %a, %b
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %a, %b
+ %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: global_store_b64 v[4:5], v[2:3], off
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_i32 s1, s0, s1
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: s_min_u32 s0, s1, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sub_i32 s1, s0, s1
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s0, s1, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %sub = sub i16 %a, %b
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_i32 s1, s0, s1
+; GFX9-NEXT: s_min_u32 s0, s1, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sub_i32 s1, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s0, s1, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %sub = sub i32 %a, %b
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_u32 s2, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_subb_u32 s3, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s1, s3, s1
+; GFX9-NEXT: s_cselect_b32 s0, s2, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sub_u32 s2, s0, s2
+; GFX11-NEXT: s_subb_u32 s3, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[0:1]
+; GFX11-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s2, s0
+; GFX11-NEXT: s_cselect_b32 s1, s3, s1
+; GFX11-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
index 8a70a8a..32cc398 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
@@ -36,7 +36,7 @@ body: |
; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/
$vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x8a
+ ; GCN-NEXT: s_set_vgpr_msb 0x458a
; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0
; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/
$vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode
@@ -50,7 +50,7 @@ body: |
; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/
$vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xcf
+ ; GCN-NEXT: s_set_vgpr_msb 0x8acf
; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0
; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/
$vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
index f508df2..7e1c28f 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
@@ -22,13 +22,13 @@ body: |
$vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec
; Single bit change
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4101
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/
$vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_rcp_f32_e64 v255, v1
$vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode
@@ -40,7 +40,7 @@ body: |
; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/
$vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x544
; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/
@@ -48,7 +48,7 @@ body: |
; VOP3
- ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; GCN-NEXT: s_set_vgpr_msb 0x4455
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
$vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
@@ -58,32 +58,32 @@ body: |
$vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
; Tuple crossing the 256 boundary
- ; GCN-NEXT: s_set_vgpr_msb 17
+ ; GCN-NEXT: s_set_vgpr_msb 0x5511
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1
; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/
$vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec
; DPP/tied operand
- ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; GCN-NEXT: s_set_vgpr_msb 0x1145
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
$vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 17
+ ; GCN-NEXT: s_set_vgpr_msb 0x4511
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1
; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
$vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec
; DS (addr, data0, and data1 operands)
- ; GCN-NEXT: s_set_vgpr_msb 20
+ ; GCN-NEXT: s_set_vgpr_msb 0x1114
; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1
; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1
DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x1400
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1
DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec
@@ -93,13 +93,13 @@ body: |
; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/
$vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x144
; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/
$vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4400
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0
$vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec
@@ -111,17 +111,17 @@ body: |
; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off
$vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: s_set_vgpr_msb 0x140
; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0
; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1]
$vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4001
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0
$vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: global_store_b32 v[0:1], v2, off
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
@@ -135,13 +135,13 @@ body: |
; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off
GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x544
; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN
$vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4400
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN
$vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr
@@ -156,12 +156,12 @@ body: |
; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen
$vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x4041
; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen
$vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen
BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
@@ -171,7 +171,7 @@ body: |
; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen
BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen
BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec
@@ -183,44 +183,44 @@ body: |
; VGPRs above 512
- ; GCN-NEXT: s_set_vgpr_msb 0xaa
+ ; GCN-NEXT: s_set_vgpr_msb 0x41aa
; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xab
+ ; GCN-NEXT: s_set_vgpr_msb 0xaaab
; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xae
+ ; GCN-NEXT: s_set_vgpr_msb 0xabae
; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xba
+ ; GCN-NEXT: s_set_vgpr_msb 0xaeba
; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xea
+ ; GCN-NEXT: s_set_vgpr_msb 0xbaea
; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2
; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
$vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xff
+ ; GCN-NEXT: s_set_vgpr_msb 0xeaff
; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3
; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/
$vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x42
+ ; GCN-NEXT: s_set_vgpr_msb 0xff42
; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/
$vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4200
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_fma_f32 v0, v1, v2, v3
$vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode
@@ -232,12 +232,12 @@ body: |
; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off
GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 11
+ ; GCN-NEXT: s_set_vgpr_msb 0xa0b
; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0
; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off
GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; GCN-NEXT: s_set_vgpr_msb 0xb55
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/
early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec
@@ -247,6 +247,7 @@ body: |
...
# ASM-LABEL: {{^}}vopd:
+
# DIS-LABEL: <vopd>:
---
name: vopd
@@ -262,35 +263,35 @@ body: |
; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4
$vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x4041
; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4
$vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x4104
; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/
$vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x401
; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3
$vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: s_set_vgpr_msb 0x140
; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1
$vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x4005
; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/
$vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x544
; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/
$vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 16
+ ; GCN-NEXT: s_set_vgpr_msb 0x4410
; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/
$vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x1000
; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3
$vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec
@@ -298,7 +299,7 @@ body: |
; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5
$vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0xae
+ ; GCN-NEXT: s_set_vgpr_msb 0x40ae
; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/
$vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec
@@ -319,31 +320,31 @@ body: |
; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1
$vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x4505
; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1
$vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x541
; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1
$vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x4144
; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1
$vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; GCN-NEXT: s_set_vgpr_msb 0x4445
; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/
$vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x4505
; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/
$vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x541
; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2
$vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x4144
; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/
$vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode
@@ -389,15 +390,15 @@ body: |
; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2
$vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x104
; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/
$vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x401
; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2
$vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x104
; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/
$vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec
@@ -417,7 +418,7 @@ body: |
; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
$vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x5500
; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2
$vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
@@ -431,7 +432,7 @@ body: |
; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2
$vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: v_fma_f32 v3, v4, v5, s2
$vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode
@@ -439,17 +440,17 @@ body: |
; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1
$vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x104
; GCN-NEXT: v_mov_b32_e32 v0, v1
$vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/
$vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x401
; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/
; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x105
; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/
$vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec
$vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec
@@ -478,16 +479,18 @@ body: |
; ASM: .LBB{{.*_1}}:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
- ; No mode switch on fall through
+ ; Reset on fallthrough block end
bb.2:
; ASM-NEXT: %bb.2:
- ; GCN-NEXT: s_nop 0
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_branch
- S_NOP 0
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
S_BRANCH %bb.3
; Reset mode on terminator
@@ -496,7 +499,7 @@ body: |
; ASM: .LBB{{.*_3}}:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_swap_pc_i64
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
$exec = S_SWAPPC_B64 undef $sgpr0_sgpr1
@@ -518,7 +521,7 @@ body: |
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_set_pc_i64
$vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
@@ -538,7 +541,7 @@ body: |
; ASM-NEXT: %bb.7:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; ASM-NEXT: ; return to shader part epilog
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
@@ -556,7 +559,7 @@ body: |
; ASM-NEXT: %bb.9:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_set_pc_i64
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec
@@ -574,13 +577,14 @@ body: |
; ASM: %bb.0:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
$vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec
bb.1:
; ASM: .LBB{{[0-9]+}}_1:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_cbranch_scc0
$vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
S_CBRANCH_SCC0 %bb.1, undef implicit $scc
@@ -604,7 +608,7 @@ body: |
; ASM: %bb.0:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; ASM: def v0
; GCN-NOT: s_set_vgpr_msb
; ASM: use v0
@@ -638,7 +642,7 @@ body: |
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
; GCN-NEXT: s_nop 0
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4001
; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/
BUNDLE implicit-def $vgpr256 {
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
@@ -680,7 +684,7 @@ body: |
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_mov_b32_e32 v3, v1
BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 {
@@ -709,7 +713,7 @@ body: |
; GCN-NEXT: s_clause 0x3e
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: v_mov_b32_e32 v1, v1
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-COUNT-60: v_mov_b32_e32 v1, v1
@@ -823,7 +827,7 @@ body: |
; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x500
; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
@@ -835,11 +839,11 @@ body: |
; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3]
V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x105
; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3]
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x500
; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3]
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 1b8e126..a1381ec 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -945,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem_2
tracksRegLiveness: true
@@ -971,6 +970,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
@@ -986,6 +986,180 @@ body: |
...
---
+name: wait_kmcnt_and_wait_loadcnt
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+...
+
+---
+name: implicit_handling_of_pending_vmem_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: pending_vmem_event_between_block
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: pending_vmem_event_between_block
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 1
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: flushing_vmem_cnt_on_block_entry
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index a42c8ac7..7581710 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1596
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1600
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1608
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1612
@@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2620
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2624
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2632
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2636
@@ -3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3
@@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1596
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1600
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1608
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1612
@@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2620
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2624
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2632
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2636
@@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
ret <2 x half> %ret
@@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608
@@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s32 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s32 offset:2632
@@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1
; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
; GFX1250-DAGISEL-NEXT: s_clause 0x3e
@@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608
@@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s32 offset:2632
@@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37]
%ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
ret <2 x half> %ret
@@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1608
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1612
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1616
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1620
@@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2632
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2636
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2640
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2644
@@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3656
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_clause 0x2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164
@@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1608
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1612
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1616
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1620
@@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2632
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2636
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2640
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2644
@@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
store float %ret, ptr %p