Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- llvm/test/CodeGen/AArch64/stackmap.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll | 165
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir | 6
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 199
-rw-r--r-- llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll | 325
-rw-r--r-- llvm/test/CodeGen/AMDGPU/readcyclecounter.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll | 236
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir | 142
-rw-r--r-- llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll | 36
-rw-r--r-- llvm/test/CodeGen/Hexagon/isel-fclass.ll | 86
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll | 63
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll | 200
-rw-r--r-- llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll | 63
-rw-r--r-- llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll | 88
-rw-r--r-- llvm/test/CodeGen/LoongArch/sink-fold-addi.ll | 758
-rw-r--r-- llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt | 22
-rw-r--r-- llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt | 22
-rw-r--r-- llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2
-rw-r--r-- llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll | 4
-rw-r--r-- llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll | 68
-rw-r--r-- llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 116
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll | 253
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/pr165232.ll | 244
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir | 12
-rw-r--r-- llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll | 4
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll | 22
-rw-r--r-- llvm/test/CodeGen/SystemZ/stackmap.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/amx-tf32-internal.ll | 7
-rw-r--r-- llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll | 12
-rwxr-xr-x llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll | 122
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll | 136
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir | 165
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir | 153
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_copy.mir | 97
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll | 87
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll | 61
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir | 134
-rw-r--r-- llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir | 113
-rw-r--r-- llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll | 371
-rw-r--r-- llvm/test/CodeGen/X86/bittest-big-integer.ll | 7103
-rw-r--r-- llvm/test/CodeGen/X86/ipra-reg-usage.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/pr165755.ll | 26
48 files changed, 8847 insertions(+), 2926 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll
index 995d254..26221d0 100644
--- a/llvm/test/CodeGen/AArch64/stackmap.ll
+++ b/llvm/test/CodeGen/AArch64/stackmap.ll
@@ -81,14 +81,14 @@
; CHECK-NEXT: .hword 8
; CHECK-NEXT: .hword 0
; CHECK-NEXT: .hword 0
-; CHECK-NEXT: .word 65535
+; CHECK-NEXT: .word -1
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .hword 8
; CHECK-NEXT: .hword 0
; CHECK-NEXT: .hword 0
-; CHECK-NEXT: .word 65535
+; CHECK-NEXT: .word -1
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
new file mode 100644
index 0000000..e440bee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
+
+define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
+; GFX11-FAKE16-LABEL: fadd_s16_uniform:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fadd_s16_uniform:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f16 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd half %a, %b
+ ret half %fadd
+}
+
+define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
+; GFX11-FAKE16-LABEL: fadd_s16_div:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: fadd_s16_div:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: fadd_s16_div:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: fadd_s16_div:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT: ; return to shader part epilog
+ %fadd = fadd half %a, %b
+ ret half %fadd
+}
+
+define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
+; GFX11-LABEL: fadd_s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f32 s0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd float %a, %b
+ ret float %fadd
+}
+
+define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
+; GCN-LABEL: fadd_s32_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %fadd = fadd float %a, %b
+ ret float %fadd
+}
+
+define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
+; GFX11-LABEL: fadd_s64_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fadd_s64_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT: s_endpgm
+ %fadd = fadd double %a, %b
+ store double %fadd, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
+; GFX11-LABEL: fadd_s64_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fadd_s64_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
+; GFX12-NEXT: s_endpgm
+ %fadd = fadd double %a, %b
+ store double %fadd, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX11-LABEL: fadd_v2s16_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_v2s16_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_lshr_b32 s3, s1, 16
+; GFX12-NEXT: s_add_f16 s0, s0, s1
+; GFX12-NEXT: s_add_f16 s1, s2, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd <2 x half> %a, %b
+ ret <2 x half> %fadd
+}
+
+define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
+; GCN-LABEL: fadd_v2s16_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_f16 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %fadd = fadd <2 x half> %a, %b
+ ret <2 x half> %fadd
+}
+
+define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: fadd_v2s32_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
+; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: fadd_v2s32_uniform:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_add_f32 s0, s0, s2
+; GFX12-NEXT: s_add_f32 s1, s1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ; return to shader part epilog
+ %fadd = fadd <2 x float> %a, %b
+ ret <2 x float> %fadd
+}
+
+define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
+; GCN-LABEL: fadd_v2s32_div:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GCN-NEXT: ; return to shader part epilog
+ %fadd = fadd <2 x float> %a, %b
+ ret <2 x float> %fadd
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
index 1a7ccf0..588802c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) {
; GFX7-LABEL: fcmp_uniform_select:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
index 67cc016..b6652f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s
---
name: test_copy_scc_vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 02d0e52..6facdfd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
ret <4 x i32> %res
}
-define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+define i16 @abs_vgpr_i16(i16 %arg) {
; GFX6-LABEL: abs_vgpr_i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX10-NEXT: v_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
-define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+define i32 @abs_vgpr_i32(i32 %arg) {
; GFX6-LABEL: abs_vgpr_i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
-define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+define i64 @abs_vgpr_i64(i64 %arg) {
; GFX6-LABEL: abs_vgpr_i64:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
@@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
-define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-LABEL: abs_vgpr_v4i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
@@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
@@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v4i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
@@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
@@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
ret <2 x i8> %res
}
-define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v2i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
@@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
ret <3 x i8> %res
}
-define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v3i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v3
; GFX10-NEXT: v_max_i16 v1, v1, v4
; GFX10-NEXT: v_max_i16 v2, v2, v5
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_max_i16 v1, v1, v4
; GFX1250-NEXT: v_max_i16 v2, v2, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
ret <2 x i16> %res
}
-define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v2i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
ret <3 x i16> %res
}
-define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v3i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
new file mode 100644
index 0000000..05a0e39
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 {
+; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users(
+; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[ENTRY:.*:]]
+; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison
+; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0
+; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1
+; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2
+; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3
+; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4
+; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5
+; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6
+; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7
+; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8
+; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9
+; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10
+; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11
+; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12
+; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13
+; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14
+; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15
+; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0
+; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1
+; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2
+; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3
+; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4
+; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5
+; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6
+; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7
+; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8
+; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9
+; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10
+; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11
+; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12
+; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13
+; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14
+; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15
+; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0
+; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1
+; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2
+; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3
+; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4
+; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5
+; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6
+; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7
+; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8
+; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9
+; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10
+; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11
+; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12
+; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13
+; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14
+; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15
+; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0
+; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1
+; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2
+; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3
+; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4
+; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5
+; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6
+; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7
+; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8
+; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9
+; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10
+; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11
+; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12
+; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13
+; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14
+; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15
+; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0
+; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1
+; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2
+; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3
+; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4
+; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5
+; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6
+; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7
+; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8
+; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9
+; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10
+; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11
+; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12
+; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13
+; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14
+; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15
+; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0
+; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1
+; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2
+; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3
+; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4
+; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5
+; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6
+; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7
+; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8
+; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9
+; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10
+; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11
+; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12
+; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13
+; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14
+; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15
+; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0
+; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1
+; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2
+; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3
+; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4
+; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5
+; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6
+; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7
+; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8
+; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9
+; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10
+; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11
+; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12
+; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13
+; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14
+; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15
+; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0
+; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0
+; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1
+; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1
+; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2
+; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2
+; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3
+; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3
+; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4
+; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4
+; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5
+; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5
+; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6
+; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6
+; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7
+; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7
+; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8
+; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8
+; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9
+; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9
+; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10
+; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10
+; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11
+; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11
+; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12
+; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12
+; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13
+; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13
+; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14
+; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14
+; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15
+; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15
+; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80
+; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0
+; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81
+; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1
+; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82
+; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2
+; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83
+; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3
+; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84
+; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4
+; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85
+; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5
+; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86
+; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6
+; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87
+; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7
+; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88
+; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8
+; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89
+; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9
+; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90
+; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10
+; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91
+; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11
+; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92
+; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12
+; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93
+; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13
+; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94
+; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14
+; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95
+; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15
+; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]]
+; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16
+; OPT-NEXT: ret void
+;
+entry:
+ %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5)
+ %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7
+ store <16 x i8> %in, ptr addrspace(5) %gep0, align 16
+ %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16
+ %sum = add <16 x i8> %load, %add
+ store <16 x i8> %sum, ptr addrspace(3) %out, align 16
+ ret void
+}
+
+attributes #0 = {"amdgpu-waves-per-eu"="2,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index f67cbe3..ddb522a8 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -1,17 +1,17 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; -global-isel=1 SI run line skipped since store not yet implemented.
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
declare i64 @llvm.readcyclecounter() #0
diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000..22e4a24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+
+define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u16_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u16_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u16 v0.h, v0.l, v1.l
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u16 v0.l, v0.h, v0.l
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i16 %a, %b
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i32 %a, %b
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i32 %a, %b
+ %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+ ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT: global_store_b32 v[2:3], v1, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i32 %a, %b
+ store i32 %sub, ptr addrspace(1) %ptr
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %a, %b
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %a, %b
+ %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: global_store_b64 v[4:5], v[2:3], off
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_i32 s1, s0, s1
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: s_min_u32 s0, s1, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sub_i32 s1, s0, s1
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s0, s1, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %sub = sub i16 %a, %b
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_i32 s1, s0, s1
+; GFX9-NEXT: s_min_u32 s0, s1, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sub_i32 s1, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s0, s1, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %sub = sub i32 %a, %b
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_u32 s2, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_subb_u32 s3, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT: s_cselect_b32 s1, s3, s1
+; GFX9-NEXT: s_cselect_b32 s0, s2, s0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sub_u32 s2, s0, s2
+; GFX11-NEXT: s_subb_u32 s3, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[0:1]
+; GFX11-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s2, s0
+; GFX11-NEXT: s_cselect_b32 s1, s3, s1
+; GFX11-NEXT: ; return to shader part epilog
+ %sub = sub i64 %a, %b
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
index 8a70a8a..32cc398 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir
@@ -36,7 +36,7 @@ body: |
; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/
$vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x8a
+ ; GCN-NEXT: s_set_vgpr_msb 0x458a
; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0
; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/
$vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode
@@ -50,7 +50,7 @@ body: |
; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/
$vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xcf
+ ; GCN-NEXT: s_set_vgpr_msb 0x8acf
; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0
; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/
$vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
index f508df2..7e1c28f 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir
@@ -22,13 +22,13 @@ body: |
$vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec
; Single bit change
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4101
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/
$vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_rcp_f32_e64 v255, v1
$vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode
@@ -40,7 +40,7 @@ body: |
; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/
$vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x544
; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/
@@ -48,7 +48,7 @@ body: |
; VOP3
- ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; GCN-NEXT: s_set_vgpr_msb 0x4455
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
$vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
@@ -58,32 +58,32 @@ body: |
$vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
; Tuple crossing the 256 boundary
- ; GCN-NEXT: s_set_vgpr_msb 17
+ ; GCN-NEXT: s_set_vgpr_msb 0x5511
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1
; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/
$vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec
; DPP/tied operand
- ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; GCN-NEXT: s_set_vgpr_msb 0x1145
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
$vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 17
+ ; GCN-NEXT: s_set_vgpr_msb 0x4511
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1
; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
$vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec
; DS (addr, data0, and data1 operands)
- ; GCN-NEXT: s_set_vgpr_msb 20
+ ; GCN-NEXT: s_set_vgpr_msb 0x1114
; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1
; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1
DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x1400
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1
DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec
@@ -93,13 +93,13 @@ body: |
; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/
$vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x144
; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/
$vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4400
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0
$vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec
@@ -111,17 +111,17 @@ body: |
; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off
$vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: s_set_vgpr_msb 0x140
; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0
; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1]
$vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4001
; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0
; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0
$vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: global_store_b32 v[0:1], v2, off
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
@@ -135,13 +135,13 @@ body: |
; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off
GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x544
; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0
; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN
$vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4400
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN
$vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr
@@ -156,12 +156,12 @@ body: |
; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen
$vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x4041
; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0
; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen
$vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen
BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
@@ -171,7 +171,7 @@ body: |
; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen
BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4100
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen
BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec
@@ -183,44 +183,44 @@ body: |
; VGPRs above 512
- ; GCN-NEXT: s_set_vgpr_msb 0xaa
+ ; GCN-NEXT: s_set_vgpr_msb 0x41aa
; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xab
+ ; GCN-NEXT: s_set_vgpr_msb 0xaaab
; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xae
+ ; GCN-NEXT: s_set_vgpr_msb 0xabae
; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xba
+ ; GCN-NEXT: s_set_vgpr_msb 0xaeba
; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3
; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/
$vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xea
+ ; GCN-NEXT: s_set_vgpr_msb 0xbaea
; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2
; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/
$vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0xff
+ ; GCN-NEXT: s_set_vgpr_msb 0xeaff
; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3
; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/
$vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x42
+ ; GCN-NEXT: s_set_vgpr_msb 0xff42
; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/
$vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec
; Reset
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4200
; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0
; GCN-NEXT: v_fma_f32 v0, v1, v2, v3
$vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode
@@ -232,12 +232,12 @@ body: |
; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off
GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 11
+ ; GCN-NEXT: s_set_vgpr_msb 0xa0b
; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0
; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off
GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x55
+ ; GCN-NEXT: s_set_vgpr_msb 0xb55
; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1
; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/
early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec
@@ -247,6 +247,7 @@ body: |
...
# ASM-LABEL: {{^}}vopd:
+
# DIS-LABEL: <vopd>:
---
name: vopd
@@ -262,35 +263,35 @@ body: |
; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4
$vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x4041
; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4
$vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x4104
; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/
$vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x401
; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3
$vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: s_set_vgpr_msb 0x140
; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1
$vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x4005
; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/
$vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x544
; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/
$vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 16
+ ; GCN-NEXT: s_set_vgpr_msb 0x4410
; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/
$vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x1000
; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3
$vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec
@@ -298,7 +299,7 @@ body: |
; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5
$vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0xae
+ ; GCN-NEXT: s_set_vgpr_msb 0x40ae
; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/
$vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec
@@ -319,31 +320,31 @@ body: |
; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1
$vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x4505
; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1
$vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x541
; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1
$vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x4144
; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1
$vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x45
+ ; GCN-NEXT: s_set_vgpr_msb 0x4445
; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/
$vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x4505
; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/
$vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x41
+ ; GCN-NEXT: s_set_vgpr_msb 0x541
; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2
$vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0x44
+ ; GCN-NEXT: s_set_vgpr_msb 0x4144
; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/
$vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode
@@ -389,15 +390,15 @@ body: |
; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2
$vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x104
; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/
$vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x401
; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2
$vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x104
; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/
$vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec
@@ -417,7 +418,7 @@ body: |
; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/
$vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x5500
; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2
$vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
@@ -431,7 +432,7 @@ body: |
; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2
$vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: v_fma_f32 v3, v4, v5, s2
$vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode
@@ -439,17 +440,17 @@ body: |
; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1
$vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: s_set_vgpr_msb 4
+ ; GCN-NEXT: s_set_vgpr_msb 0x104
; GCN-NEXT: v_mov_b32_e32 v0, v1
$vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/
$vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x401
; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/
; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x105
; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/
$vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec
$vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec
@@ -478,16 +479,18 @@ body: |
; ASM: .LBB{{.*_1}}:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
- ; No mode switch on fall through
+ ; Reset on fallthrough block end
bb.2:
; ASM-NEXT: %bb.2:
- ; GCN-NEXT: s_nop 0
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 64
+ ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_branch
- S_NOP 0
+ $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
S_BRANCH %bb.3
; Reset mode on terminator
@@ -496,7 +499,7 @@ body: |
; ASM: .LBB{{.*_3}}:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_swap_pc_i64
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
$exec = S_SWAPPC_B64 undef $sgpr0_sgpr1
@@ -518,7 +521,7 @@ body: |
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_set_pc_i64
$vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
@@ -538,7 +541,7 @@ body: |
; ASM-NEXT: %bb.7:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; ASM-NEXT: ; return to shader part epilog
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec
@@ -556,7 +559,7 @@ body: |
; ASM-NEXT: %bb.9:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_set_pc_i64
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec
@@ -574,13 +577,14 @@ body: |
; ASM: %bb.0:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
$vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec
bb.1:
; ASM: .LBB{{[0-9]+}}_1:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: s_cbranch_scc0
$vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
S_CBRANCH_SCC0 %bb.1, undef implicit $scc
@@ -604,7 +608,7 @@ body: |
; ASM: %bb.0:
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; ASM: def v0
; GCN-NOT: s_set_vgpr_msb
; ASM: use v0
@@ -638,7 +642,7 @@ body: |
; GCN-NEXT: s_set_vgpr_msb 64
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
; GCN-NEXT: s_nop 0
- ; GCN-NEXT: s_set_vgpr_msb 1
+ ; GCN-NEXT: s_set_vgpr_msb 0x4001
; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/
BUNDLE implicit-def $vgpr256 {
$vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
@@ -680,7 +684,7 @@ body: |
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_mov_b32_e32 v3, v1
BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 {
@@ -709,7 +713,7 @@ body: |
; GCN-NEXT: s_clause 0x3e
; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x4000
; GCN-NEXT: v_mov_b32_e32 v1, v1
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-COUNT-60: v_mov_b32_e32 v1, v1
@@ -823,7 +827,7 @@ body: |
; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x500
; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
@@ -835,11 +839,11 @@ body: |
; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3]
V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 5
+ ; GCN-NEXT: s_set_vgpr_msb 0x105
; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3]
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: s_set_vgpr_msb 0
+ ; GCN-NEXT: s_set_vgpr_msb 0x500
; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3]
$vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index a42c8ac7..7581710 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1596
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1600
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1608
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1612
@@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2620
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2624
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2632
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2636
@@ -3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3
@@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1596
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1600
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1608
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1612
@@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2620
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2624
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2632
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2636
@@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
ret <2 x half> %ret
@@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608
@@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s32 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s32 offset:2632
@@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1
; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
; GFX1250-DAGISEL-NEXT: s_clause 0x3e
@@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608
@@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s32 offset:2632
@@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37]
%ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
ret <2 x half> %ret
@@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1608
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1612
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1616
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1620
@@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2632
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2636
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2640
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2644
@@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3656
; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_clause 0x2
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33
; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164
@@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1600
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1604
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1608
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1612
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1616
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1620
@@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2624
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2628
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2632
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2636
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2640
; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2644
@@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31]
%ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
store float %ret, ptr %p
diff --git a/llvm/test/CodeGen/Hexagon/isel-fclass.ll b/llvm/test/CodeGen/Hexagon/isel-fclass.ll
new file mode 100644
index 0000000..96b0210
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/isel-fclass.ll
@@ -0,0 +1,86 @@
+; Tests lowering of sfclass/dfclass compares.
+; Sub-optimal code generated without these patterns:
+; {
+; p0 = sfclass(r0,#16)
+; r0 = sfadd(r0,r0)
+; }
+; {
+; r2 = p0
+; }
+; {
+; if (p0.new) r0 = ##1065353216
+; p0 = cmp.eq(r2,#0)
+; jumpr r31
+; }
+; With the patterns added, we should instead be generating:
+; {
+; p0 = sfclass(r0,#16)
+; r0 = sfadd(r0,r0)
+; }
+; {
+; if (!p0) r0 = ##1065353216
+; jumpr r31
+; }
+
+; RUN: llc -march=hexagon -stop-after=hexagon-isel %s -o - | FileCheck %s
+
+; CHECK: bb.0.entry1
+; CHECK: F2_sfclass
+; CHECK-NOT: C2_cmp
+; CHECK: C2_not
+; CHECK: F2_sfadd
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define float @test1(float noundef %x) {
+entry1:
+ %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16)
+ %tobool.not = icmp eq i32 %0, 0
+ %add = fadd float %x, %x
+ %spec.select = select i1 %tobool.not, float 1.000000e+00, float %add
+ ret float %spec.select
+}
+
+; CHECK: bb.0.entry2
+; CHECK: F2_sfclass
+; CHECK-NOT: C2_cmp
+; CHECK: F2_sfadd
+define float @test2(float noundef %x) {
+entry2:
+ %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16)
+ %tobool.not = icmp eq i32 %0, 0
+ %add = fadd float %x, %x
+ %spec.select = select i1 %tobool.not, float %add, float 1.000000e+00
+ ret float %spec.select
+}
+
+; CHECK: bb.0.entry3
+; CHECK: F2_dfclass
+; CHECK-NOT: C2_cmp
+; CHECK: C2_not
+; CHECK: F2_dfadd
+define double @test3(double noundef %x) {
+entry3:
+ %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16)
+ %tobool.not = icmp eq i32 %0, 0
+ %add = fadd double %x, %x
+ %spec.select = select i1 %tobool.not, double 1.000000e+00, double %add
+ ret double %spec.select
+}
+
+; CHECK: bb.0.entry4
+; CHECK: F2_dfclass
+; CHECK-NOT: C2_cmp
+; CHECK: F2_dfadd
+define double @test4(double noundef %x) {
+entry4:
+ %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16)
+ %tobool.not = icmp eq i32 %0, 0
+ %add = fadd double %x, %x
+ %spec.select = select i1 %tobool.not, double %add, double 1.000000e+00
+ ret double %spec.select
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.hexagon.F2.dfclass(double, i32 immarg)
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
index ba2118f..b3155c9 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind {
ret void
}
+define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvxori.b $xr0, $xr0, 255
+; CHECK-NEXT: xvclz.b $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <32 x i8>, ptr %src
+ %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false)
+ store <32 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.h $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i16>, ptr %src
+ %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false)
+ store <16 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.w $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i32>, ptr %src
+ %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false)
+ store <8 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.d $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i64>, ptr %src
+ %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false)
+ store <4 x i64> %res, ptr %dst
+ ret void
+}
+
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
index 79407c3..fa5f27e 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
@@ -7,38 +7,8 @@ define void @ceil_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrp.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrp.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrp.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrp.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -52,21 +22,7 @@ define void @ceil_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrp.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrp.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrp.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -81,38 +37,8 @@ define void @floor_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrm.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrm.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrm.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrm.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -126,21 +52,7 @@ define void @floor_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrm.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrm.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrm.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -155,38 +67,8 @@ define void @trunc_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrz.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrz.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrz.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrz.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -200,21 +82,7 @@ define void @trunc_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrz.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrz.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrz.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -229,38 +97,8 @@ define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrne.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT: vfrintrne.s $vr3, $vr3
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrne.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT: xvst $xr3, $a0, 0
+; CHECK-NEXT: xvfrintrne.s $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %a0
@@ -274,21 +112,7 @@ define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrne.d $vr2, $vr2
-; CHECK-NEXT: vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.d $vr1, $vr1
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vfrintrne.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT: xvfrintrne.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
index a9a38e8..6ac7d51 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind {
ret void
}
+define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vxori.b $vr0, $vr0, 255
+; CHECK-NEXT: vclz.b $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i8>, ptr %src
+ %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false)
+ store <16 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.h $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i16>, ptr %src
+ %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false)
+ store <8 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.w $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i32>, ptr %src
+ %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false)
+ store <4 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.d $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <2 x i64>, ptr %src
+ %neg = xor <2 x i64> %v, <i64 -1, i64 -1>
+ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false)
+ store <2 x i64> %res, ptr %dst
+ ret void
+}
+
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
index 1ca6290..cb01ac0 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
@@ -7,22 +7,8 @@ define void @ceil_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrp.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrp.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -36,13 +22,7 @@ define void @ceil_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: ceil_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrp.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrp.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -57,22 +37,8 @@ define void @floor_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrm.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrm.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -86,13 +52,7 @@ define void @floor_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: floor_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrm.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrm.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -107,22 +67,8 @@ define void @trunc_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrz.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrz.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -136,13 +82,7 @@ define void @trunc_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: trunc_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrz.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrz.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -157,22 +97,8 @@ define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT: vfrintrne.s $vr2, $vr2
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.s $vr1, $vr1
-; CHECK-NEXT: vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: vfrintrne.s $vr0, $vr0
-; CHECK-NEXT: vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %a0
@@ -186,13 +112,7 @@ define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind {
; CHECK-LABEL: roundeven_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: vfrintrne.d $vr1, $vr1
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: vfrintrne.d $vr0, $vr0
-; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
new file mode 100644
index 0000000..9a806a1
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -0,0 +1,758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA64 %s
+
+%struct.S = type { i64, i64, i8 }
+%struct.F = type { float, double, float }
+%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> }
+
+define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_i64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB0_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: move $s5, $zero
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB0_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: ld.w $a0, $s2, 4
+; LA32-NEXT: ld.w $a1, $s2, 0
+; LA32-NEXT: add.w $a0, $a0, $s6
+; LA32-NEXT: add.w $s3, $a1, $s3
+; LA32-NEXT: sltu $a1, $s3, $a1
+; LA32-NEXT: addi.w $s4, $s4, 1
+; LA32-NEXT: sltui $a2, $s4, 1
+; LA32-NEXT: add.w $s5, $s5, $a2
+; LA32-NEXT: xor $a2, $s4, $s1
+; LA32-NEXT: xor $a3, $s5, $s0
+; LA32-NEXT: or $a2, $a2, $a3
+; LA32-NEXT: add.w $s6, $a0, $a1
+; LA32-NEXT: bnez $a2, .LBB0_2
+; LA32-NEXT: b .LBB0_4
+; LA32-NEXT: .LBB0_3:
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA32-NEXT: st.w $s3, $s2, 0
+; LA32-NEXT: st.w $s6, $s2, 4
+; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_i64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB0_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB0_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: ld.d $a0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: add.d $s2, $a0, $s2
+; LA64-NEXT: bnez $s0, .LBB0_2
+; LA64-NEXT: b .LBB0_4
+; LA64-NEXT: .LBB0_3:
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA64-NEXT: st.d $s2, $s1, 0
+; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load i64, ptr %y
+ %add = add nsw i64 %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ store i64 %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_f32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB1_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB1_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: fld.s $fa0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA32-NEXT: bnez $a0, .LBB1_2
+; LA32-NEXT: b .LBB1_4
+; LA32-NEXT: .LBB1_3:
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA32-NEXT: fst.s $fs0, $s2, 0
+; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_f32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB1_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB1_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: fld.s $fa0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA64-NEXT: bnez $s0, .LBB1_2
+; LA64-NEXT: b .LBB1_4
+; LA64-NEXT: .LBB1_3:
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA64-NEXT: fst.s $fs0, $s1, 0
+; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load float, ptr %y
+ %add = fadd float %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ store float %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v4i32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB2_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB2_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vld $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB2_2
+; LA32-NEXT: b .LBB2_4
+; LA32-NEXT: .LBB2_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA32-NEXT: vst $vr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v4i32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $a1, .LBB2_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB2_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vld $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB2_2
+; LA64-NEXT: b .LBB2_4
+; LA64-NEXT: .LBB2_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA64-NEXT: vst $vr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <4 x i32>, ptr %y
+ %addv = add <4 x i32> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <4 x i32> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v16i16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 32
+; LA32-NEXT: bnez $a1, .LBB3_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB3_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvld $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB3_2
+; LA32-NEXT: b .LBB3_4
+; LA32-NEXT: .LBB3_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA32-NEXT: xvst $xr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v16i16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 32
+; LA64-NEXT: blez $a1, .LBB3_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB3_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvld $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB3_2
+; LA64-NEXT: b .LBB3_4
+; LA64-NEXT: .LBB3_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA64-NEXT: xvst $xr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <16 x i16>, ptr %y
+ %addv = add <16 x i16> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <16 x i16> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extracti8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB4_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB4_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vldrepl.b $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB4_2
+; LA32-NEXT: b .LBB4_4
+; LA32-NEXT: .LBB4_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extracti8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB4_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB4_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vldrepl.b $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB4_2
+; LA64-NEXT: b .LBB4_4
+; LA64-NEXT: .LBB4_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load i8, ptr %y
+ %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
+ %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
+ %addv = add <16 x i8> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <16 x i8> %sum.lcssa, i32 1
+ store i8 %res, ptr %y
+ ret void
+}
+
+define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extractf64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB5_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB5_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB5_2
+; LA32-NEXT: b .LBB5_4
+; LA32-NEXT: .LBB5_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extractf64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB5_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB5_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB5_2
+; LA64-NEXT: b .LBB5_4
+; LA64-NEXT: .LBB5_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load double, ptr %y
+ %ins0 = insertelement <4 x double> poison, double %e, i32 0
+ %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
+ %addv = fadd <4 x double> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <4 x double> %sum.lcssa, i32 1
+ store double %res, ptr %y
+ ret void
+}
+
+declare void @f(ptr)
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index d3c0da9..000c67ef 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ]
Key: PSUBWrr: [ 0.00 0.00 ]
Key: PSWAPDrm: [ 0.00 0.00 ]
Key: PSWAPDrr: [ 0.00 0.00 ]
-Key: PT: [ 0.00 0.00 ]
Key: PTCMMIMFP: [ 0.00 0.00 ]
Key: PTCMMRLFP: [ 0.00 0.00 ]
-Key: PTCONJTCMMIMFP: [ 0.00 0.00 ]
-Key: PTCONJTFP: [ 0.00 0.00 ]
Key: PTCVTROWD: [ 0.00 0.00 ]
Key: PTCVTROWPS: [ 0.00 0.00 ]
Key: PTDPBF: [ 0.00 0.00 ]
@@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ]
Key: PTILEMOVROWrreV: [ 0.00 0.00 ]
Key: PTILEMOVROWrri: [ 0.00 0.00 ]
Key: PTILEMOVROWrriV: [ 0.00 0.00 ]
-Key: PTILEPAIRLOAD: [ 0.00 0.00 ]
-Key: PTILEPAIRSTORE: [ 0.00 0.00 ]
Key: PTILESTORED: [ 0.00 0.00 ]
Key: PTILESTOREDV: [ 0.00 0.00 ]
Key: PTILEZERO: [ 0.00 0.00 ]
Key: PTILEZEROV: [ 0.00 0.00 ]
Key: PTMMULTF: [ 0.00 0.00 ]
-Key: PTTCMMIMFP: [ 0.00 0.00 ]
-Key: PTTCMMRLFP: [ 0.00 0.00 ]
-Key: PTTDPBF: [ 0.00 0.00 ]
-Key: PTTDPFP: [ 0.00 0.00 ]
-Key: PTTMMULTF: [ 0.00 0.00 ]
-Key: PTTRANSPOSED: [ 0.00 0.00 ]
-Key: PTTRANSPOSEDV: [ 0.00 0.00 ]
Key: PTWRITE: [ 0.00 0.00 ]
Key: PTWRITEm: [ 0.00 0.00 ]
Key: PTWRITEr: [ 0.00 0.00 ]
@@ -1717,8 +1705,6 @@ Key: TAILJMPm: [ 0.00 0.00 ]
Key: TAILJMPr: [ 0.00 0.00 ]
Key: TCMMIMFP: [ 0.00 0.00 ]
Key: TCMMRLFP: [ 0.00 0.00 ]
-Key: TCONJTCMMIMFP: [ 0.00 0.00 ]
-Key: TCONJTFP: [ 0.00 0.00 ]
Key: TCRETURN_HIPE: [ 0.00 0.00 ]
Key: TCRETURN_WIN: [ 0.00 0.00 ]
Key: TCRETURN_WINmi: [ 0.00 0.00 ]
@@ -1764,12 +1750,6 @@ Key: TPAUSE: [ 0.00 0.00 ]
Key: TRAP: [ 0.00 0.00 ]
Key: TST_F: [ 0.00 0.00 ]
Key: TST_Fp: [ 0.00 0.00 ]
-Key: TTCMMIMFP: [ 0.00 0.00 ]
-Key: TTCMMRLFP: [ 0.00 0.00 ]
-Key: TTDPBF: [ 0.00 0.00 ]
-Key: TTDPFP: [ 0.00 0.00 ]
-Key: TTMMULTF: [ 0.00 0.00 ]
-Key: TTRANSPOSED: [ 0.00 0.00 ]
Key: TZCNT: [ 0.00 0.00 ]
Key: TZMSK: [ 0.00 0.00 ]
Key: UBSAN_UD: [ 0.00 0.00 ]
@@ -7034,7 +7014,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ]
Key: PhyReg_VR512: [ 0.00 0.00 ]
Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
Key: PhyReg_TILE: [ 0.00 0.00 ]
-Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
Key: VirtReg_GR8: [ 0.00 0.00 ]
Key: VirtReg_GRH8: [ 0.00 0.00 ]
Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
@@ -7170,4 +7149,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ]
Key: VirtReg_VR512: [ 0.00 0.00 ]
Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
Key: VirtReg_TILE: [ 0.00 0.00 ]
-Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index c6e5508..bb72886 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ]
Key: PSUBWrr: [ 0.00 0.00 ]
Key: PSWAPDrm: [ 0.00 0.00 ]
Key: PSWAPDrr: [ 0.00 0.00 ]
-Key: PT: [ 0.00 0.00 ]
Key: PTCMMIMFP: [ 0.00 0.00 ]
Key: PTCMMRLFP: [ 0.00 0.00 ]
-Key: PTCONJTCMMIMFP: [ 0.00 0.00 ]
-Key: PTCONJTFP: [ 0.00 0.00 ]
Key: PTCVTROWD: [ 0.00 0.00 ]
Key: PTCVTROWPS: [ 0.00 0.00 ]
Key: PTDPBF: [ 0.00 0.00 ]
@@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ]
Key: PTILEMOVROWrreV: [ 0.00 0.00 ]
Key: PTILEMOVROWrri: [ 0.00 0.00 ]
Key: PTILEMOVROWrriV: [ 0.00 0.00 ]
-Key: PTILEPAIRLOAD: [ 0.00 0.00 ]
-Key: PTILEPAIRSTORE: [ 0.00 0.00 ]
Key: PTILESTORED: [ 0.00 0.00 ]
Key: PTILESTOREDV: [ 0.00 0.00 ]
Key: PTILEZERO: [ 0.00 0.00 ]
Key: PTILEZEROV: [ 0.00 0.00 ]
Key: PTMMULTF: [ 0.00 0.00 ]
-Key: PTTCMMIMFP: [ 0.00 0.00 ]
-Key: PTTCMMRLFP: [ 0.00 0.00 ]
-Key: PTTDPBF: [ 0.00 0.00 ]
-Key: PTTDPFP: [ 0.00 0.00 ]
-Key: PTTMMULTF: [ 0.00 0.00 ]
-Key: PTTRANSPOSED: [ 0.00 0.00 ]
-Key: PTTRANSPOSEDV: [ 0.00 0.00 ]
Key: PTWRITE: [ 0.00 0.00 ]
Key: PTWRITEm: [ 0.00 0.00 ]
Key: PTWRITEr: [ 0.00 0.00 ]
@@ -1717,8 +1705,6 @@ Key: TAILJMPm: [ 0.00 0.00 ]
Key: TAILJMPr: [ 0.00 0.00 ]
Key: TCMMIMFP: [ 0.00 0.00 ]
Key: TCMMRLFP: [ 0.00 0.00 ]
-Key: TCONJTCMMIMFP: [ 0.00 0.00 ]
-Key: TCONJTFP: [ 0.00 0.00 ]
Key: TCRETURN_HIPE: [ 0.00 0.00 ]
Key: TCRETURN_WIN: [ 0.00 0.00 ]
Key: TCRETURN_WINmi: [ 0.00 0.00 ]
@@ -1764,12 +1750,6 @@ Key: TPAUSE: [ 0.00 0.00 ]
Key: TRAP: [ 0.00 0.00 ]
Key: TST_F: [ 0.00 0.00 ]
Key: TST_Fp: [ 0.00 0.00 ]
-Key: TTCMMIMFP: [ 0.00 0.00 ]
-Key: TTCMMRLFP: [ 0.00 0.00 ]
-Key: TTDPBF: [ 0.00 0.00 ]
-Key: TTDPFP: [ 0.00 0.00 ]
-Key: TTMMULTF: [ 0.00 0.00 ]
-Key: TTRANSPOSED: [ 0.00 0.00 ]
Key: TZCNT: [ 0.00 0.00 ]
Key: TZMSK: [ 0.00 0.00 ]
Key: UBSAN_UD: [ 0.00 0.00 ]
@@ -7034,7 +7014,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ]
Key: PhyReg_VR512: [ 0.00 0.00 ]
Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
Key: PhyReg_TILE: [ 0.00 0.00 ]
-Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
Key: VirtReg_GR8: [ 0.00 0.00 ]
Key: VirtReg_GRH8: [ 0.00 0.00 ]
Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
@@ -7170,4 +7149,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ]
Key: VirtReg_VR512: [ 0.00 0.00 ]
Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
Key: VirtReg_TILE: [ 0.00 0.00 ]
-Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
index bd8d882..9dd402d 100644
--- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
+++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
@@ -26,7 +26,7 @@
; Also, the first eviction problem is significantly less than 300 instructions. Check
; that there is a zero value.
; Note: we're regex-ing some of the opcodes to avoid test flakiness.
-; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0,
+; CHECK: instructions: 20,{{([0-9]{4})}},{{([0-9]{4})}},{{([0-9]{4})}},{{.*}},0,
; Only the candidate virtreg and the 10th LR are included in this problem. Make
; sure the other LRs have values of zero. There are 2700 0s followed by some 1s.
; There's a limit to how many repetitions can be matched.
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
index b5c43fd2..d653895 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
index 57342dc..5de1ac8 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
index 6296d5a..2f5c1ef 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
index e5ae387..a2b2c2f 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
index 7d04ada..e4c48dd 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
index b0fe77c..727bb3b 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
target triple = "nvptx64-nvidia-cuda"
@@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
@@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
-; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index c3183a1..c50a0fb3 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -7,11 +7,11 @@
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 0
; Num Functions
-; CHECK-NEXT: .word 12
+; CHECK-NEXT: .word 13
; Num LargeConstants
-; CHECK-NEXT: .word 2
+; CHECK-NEXT: .word 3
; Num Callsites
-; CHECK-NEXT: .word 16
+; CHECK-NEXT: .word 17
; Functions and stack size
; CHECK-NEXT: .quad constantargs
@@ -38,8 +38,8 @@
; CHECK-NEXT: .quad liveConstant
; CHECK-NEXT: .quad 0
; CHECK-NEXT: .quad 1
-; CHECK-NEXT: .quad spilledValue
-; CHECK-NEXT: .quad 144
+; CHECK-NEXT: .quad liveArgs
+; CHECK-NEXT: .quad 0
; CHECK-NEXT: .quad 1
; CHECK-NEXT: .quad directFrameIdx
; CHECK-NEXT: .quad 48
@@ -50,10 +50,14 @@
; CHECK-NEXT: .quad needsStackRealignment
; CHECK-NEXT: .quad -1
; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad floats
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad 1
; Num LargeConstants
; CHECK-NEXT: .quad 4294967295
; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .quad 4609434218613702656
; Constant arguments
;
@@ -278,7 +282,7 @@ define void @liveConstant() {
;
; Verify 28 stack map entries.
;
-; CHECK-LABEL: .word .L{{.*}}-spilledValue
+; CHECK-LABEL: .word .L{{.*}}-liveArgs
; CHECK-NEXT: .half 0
; CHECK-NEXT: .half 28
;
@@ -290,7 +294,7 @@ define void @liveConstant() {
; CHECK-NEXT: .half 2
; CHECK-NEXT: .half 0
; CHECK-NEXT: .word
-define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
+define void @liveArgs(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) {
entry:
call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27)
ret void
@@ -379,6 +383,104 @@ define void @needsStackRealignment() {
}
declare void @escape_values(...)
+; CHECK-LABEL: .word .L{{.*}}-floats
+; CHECK-NEXT: .half 0
+; Num Locations
+; CHECK-NEXT: .half 12
+; Loc 0: constant float as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 1: constant double as large constant integer
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 2: constant half as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 3: constant bfloat as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 4: float value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 10
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 5: double value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 11
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 6: half value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 12
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 7: bfloat value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 13
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 8: float on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 9: double on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 10: half on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 11: bfloat on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @floats(float %f, double %g, half %h, bfloat %i) {
+ %ff = alloca float
+ %gg = alloca double
+ %hh = alloca half
+ %ii = alloca bfloat
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25,
+ double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii)
+ ret void
+}
+
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 4c35b25..7e6f2c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -15265,6 +15265,259 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
ret <4 x i32> %x
}
+define <7 x i8> @mgather_baseidx_v7i8(ptr %base, <7 x i8> %idxs, <7 x i1> %m, <7 x i8> %passthru) {
+; RV32-LABEL: mgather_baseidx_v7i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 127
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vmv.s.x v10, a1
+; RV32-NEXT: vmand.mm v0, v0, v10
+; RV32-NEXT: vsext.vf4 v10, v8
+; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t
+; RV32-NEXT: vmv1r.v v8, v9
+; RV32-NEXT: ret
+;
+; RV64V-LABEL: mgather_baseidx_v7i8:
+; RV64V: # %bb.0:
+; RV64V-NEXT: li a1, 127
+; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64V-NEXT: vmv.s.x v10, a1
+; RV64V-NEXT: vmand.mm v0, v0, v10
+; RV64V-NEXT: vsext.vf8 v12, v8
+; RV64V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
+; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t
+; RV64V-NEXT: vmv1r.v v8, v9
+; RV64V-NEXT: ret
+;
+; RV64ZVE32F-LABEL: mgather_baseidx_v7i8:
+; RV64ZVE32F: # %bb.0:
+; RV64ZVE32F-NEXT: addi sp, sp, -16
+; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16
+; RV64ZVE32F-NEXT: .cfi_remember_state
+; RV64ZVE32F-NEXT: li a1, 64
+; RV64ZVE32F-NEXT: addi a2, sp, 8
+; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64ZVE32F-NEXT: vsm.v v0, (a2)
+; RV64ZVE32F-NEXT: ld a1, 8(sp)
+; RV64ZVE32F-NEXT: andi a2, a1, 1
+; RV64ZVE32F-NEXT: beqz a2, .LBB132_2
+; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.v.x v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: .LBB132_2: # %else
+; RV64ZVE32F-NEXT: andi a2, a1, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB132_4
+; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2
+; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: lbu a3, 0(a3)
+; RV64ZVE32F-NEXT: vmv.v.x v10, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 4
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: .LBB132_4: # %else2
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB132_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 3
+; RV64ZVE32F-NEXT: vmv.x.s a4, v11
+; RV64ZVE32F-NEXT: vmv.v.x v11, a3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v12
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 4
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT: vmv.x.s a4, v12
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a3
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v9, v11, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: .LBB132_6: # %else5
+; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: beqz a2, .LBB132_8
+; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: vmv.x.s a4, v11
+; RV64ZVE32F-NEXT: vmv.v.x v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT: vmv.x.s a4, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: lbu a3, 0(a3)
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a3
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: .LBB132_8: # %else8
+; RV64ZVE32F-NEXT: andi a2, a1, 16
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB132_13
+; RV64ZVE32F-NEXT: # %bb.9: # %else11
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: bnez a2, .LBB132_14
+; RV64ZVE32F-NEXT: .LBB132_10: # %else14
+; RV64ZVE32F-NEXT: andi a1, a1, 64
+; RV64ZVE32F-NEXT: beqz a1, .LBB132_12
+; RV64ZVE32F-NEXT: .LBB132_11: # %cond.load16
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vmv.v.x v8, a1
+; RV64ZVE32F-NEXT: vmv.x.s a1, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: add a0, a0, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3
+; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+; RV64ZVE32F-NEXT: vmv.x.s a1, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 5
+; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: lbu a0, 0(a0)
+; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT: vmv.x.s a1, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: .LBB132_12: # %else17
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv1r.v v8, v9
+; RV64ZVE32F-NEXT: addi sp, sp, 16
+; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 0
+; RV64ZVE32F-NEXT: ret
+; RV64ZVE32F-NEXT: .LBB132_13: # %cond.load10
+; RV64ZVE32F-NEXT: .cfi_restore_state
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2
+; RV64ZVE32F-NEXT: vmv.x.s a4, v10
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.v.x v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4
+; RV64ZVE32F-NEXT: vmv.x.s a4, v11
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: beqz a2, .LBB132_10
+; RV64ZVE32F-NEXT: .LBB132_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: vmv.x.s a4, v11
+; RV64ZVE32F-NEXT: vmv.v.x v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4
+; RV64ZVE32F-NEXT: vmv.x.s a4, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: lbu a3, 0(a3)
+; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a4
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2
+; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: andi a1, a1, 64
+; RV64ZVE32F-NEXT: bnez a1, .LBB132_11
+; RV64ZVE32F-NEXT: j .LBB132_12
+ %ptrs = getelementptr inbounds i8, ptr %base, <7 x i8> %idxs
+ %v = call <7 x i8> @llvm.masked.gather.v7i8.v7p0(<7 x ptr> %ptrs, i32 1, <7 x i1> %m, <7 x i8> %passthru)
+ ret <7 x i8> %v
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32V-ZVFH: {{.*}}
; RV32V-ZVFHMIN: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr165232.ll b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll
new file mode 100644
index 0000000..bef53c6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+define i1 @main(ptr %var_117, ptr %arrayinit.element3045, ptr %arrayinit.element3047, ptr %arrayinit.element3049, ptr %arrayinit.element3051, ptr %arrayinit.element3053, ptr %arrayinit.element3055, ptr %arrayinit.element3057, ptr %arrayinit.element3059, ptr %arrayinit.element3061, ptr %arrayinit.element3063, ptr %arrayinit.element3065, ptr %arrayinit.element3067, i64 %var_94_i.07698, target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1) {
+; CHECK-LABEL: main:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: mv t1, t0
+; CHECK-NEXT: slli t0, t0, 1
+; CHECK-NEXT: add t0, t0, t1
+; CHECK-NEXT: sub sp, sp, t0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: sd a1, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd a2, 0(sp) # 8-byte Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: ld t0, 56(a1)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: ld t1, 48(a1)
+; CHECK-NEXT: vsetvli t2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: ld t2, 40(a1)
+; CHECK-NEXT: # kill: def $v10 killed $v9 killed $vtype
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: ld t3, 32(a1)
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: ld t4, 16(a1)
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: mv a2, a1
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: ld t5, 24(a1)
+; CHECK-NEXT: vmv.v.i v13, 0
+; CHECK-NEXT: vsetvli t6, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v22, 0
+; CHECK-NEXT: vmv1r.v v14, v9
+; CHECK-NEXT: sd zero, 0(a0)
+; CHECK-NEXT: vmv.v.i v24, 0
+; CHECK-NEXT: vmv1r.v v15, v9
+; CHECK-NEXT: vmv1r.v v18, v9
+; CHECK-NEXT: li t6, 1023
+; CHECK-NEXT: vmv.v.i v26, 0
+; CHECK-NEXT: vmv1r.v v19, v9
+; CHECK-NEXT: slli t6, t6, 52
+; CHECK-NEXT: vmv.v.i v28, 0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs2r.v v22, (a1) # vscale x 16-byte Folded Spill
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: vs4r.v v24, (a1) # vscale x 32-byte Folded Spill
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: add a1, a1, a2
+; CHECK-NEXT: ld a2, 0(sp) # 8-byte Folded Reload
+; CHECK-NEXT: vs2r.v v28, (a1) # vscale x 16-byte Folded Spill
+; CHECK-NEXT: ld a1, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT: vmv1r.v v20, v9
+; CHECK-NEXT: sd t6, 0(t5)
+; CHECK-NEXT: vmv2r.v v16, v14
+; CHECK-NEXT: vmv2r.v v14, v12
+; CHECK-NEXT: vmv2r.v v12, v10
+; CHECK-NEXT: vmv1r.v v11, v9
+; CHECK-NEXT: vmv1r.v v21, v9
+; CHECK-NEXT: csrr t5, vlenb
+; CHECK-NEXT: slli t5, t5, 3
+; CHECK-NEXT: add t5, sp, t5
+; CHECK-NEXT: addi t5, t5, 16
+; CHECK-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill
+; CHECK-NEXT: csrr t6, vlenb
+; CHECK-NEXT: slli t6, t6, 1
+; CHECK-NEXT: add t5, t5, t6
+; CHECK-NEXT: vs2r.v v20, (t5) # vscale x 16-byte Folded Spill
+; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v19, 0
+; CHECK-NEXT: vmclr.m v10
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v6, 0
+; CHECK-NEXT: .LBB0_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmv1r.v v20, v19
+; CHECK-NEXT: vmv1r.v v3, v19
+; CHECK-NEXT: vmv1r.v v5, v19
+; CHECK-NEXT: vmv1r.v v2, v19
+; CHECK-NEXT: vmv1r.v v31, v19
+; CHECK-NEXT: vmv1r.v v30, v19
+; CHECK-NEXT: vmv1r.v v4, v19
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv4r.v v24, v12
+; CHECK-NEXT: vmv2r.v v28, v16
+; CHECK-NEXT: vmv2r.v v8, v6
+; CHECK-NEXT: vmv1r.v v18, v19
+; CHECK-NEXT: vmv1r.v v21, v10
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma
+; CHECK-NEXT: vle32.v v20, (t4)
+; CHECK-NEXT: vle32.v v3, (t1)
+; CHECK-NEXT: vle32.v v30, (a7)
+; CHECK-NEXT: vle64.v v8, (a4)
+; CHECK-NEXT: vle32.v v5, (t2)
+; CHECK-NEXT: vle32.v v2, (t3)
+; CHECK-NEXT: vle32.v v31, (a6)
+; CHECK-NEXT: vmv1r.v v24, v30
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT: vmflt.vv v21, v8, v6, v0.t
+; CHECK-NEXT: vmv1r.v v8, v19
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu
+; CHECK-NEXT: vle32.v v18, (a2)
+; CHECK-NEXT: vle32.v v8, (a3)
+; CHECK-NEXT: vle32.v v4, (a5)
+; CHECK-NEXT: vmv1r.v v22, v20
+; CHECK-NEXT: csrr t5, vlenb
+; CHECK-NEXT: slli t5, t5, 3
+; CHECK-NEXT: add t5, sp, t5
+; CHECK-NEXT: addi t5, t5, 16
+; CHECK-NEXT: vl1r.v v1, (t5) # vscale x 8-byte Folded Reload
+; CHECK-NEXT: csrr t6, vlenb
+; CHECK-NEXT: add t5, t5, t6
+; CHECK-NEXT: vl2r.v v2, (t5) # vscale x 16-byte Folded Reload
+; CHECK-NEXT: slli t6, t6, 1
+; CHECK-NEXT: add t5, t5, t6
+; CHECK-NEXT: vl1r.v v4, (t5) # vscale x 8-byte Folded Reload
+; CHECK-NEXT: vsseg4e32.v v1, (zero)
+; CHECK-NEXT: vsseg8e32.v v22, (a1)
+; CHECK-NEXT: vmv1r.v v0, v21
+; CHECK-NEXT: vssub.vv v8, v19, v18, v0.t
+; CHECK-NEXT: csrr t5, vlenb
+; CHECK-NEXT: slli t5, t5, 2
+; CHECK-NEXT: mv t6, t5
+; CHECK-NEXT: slli t5, t5, 1
+; CHECK-NEXT: add t5, t5, t6
+; CHECK-NEXT: add t5, sp, t5
+; CHECK-NEXT: addi t5, t5, 16
+; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, t0, e64, m2, ta, ma
+; CHECK-NEXT: vsseg2e64.v v20, (zero)
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: addi t5, sp, 16
+; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: csrr t6, vlenb
+; CHECK-NEXT: slli t6, t6, 2
+; CHECK-NEXT: add t5, t5, t6
+; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: vsetivli zero, 0, e64, m2, ta, ma
+; CHECK-NEXT: vsseg4e64.v v20, (zero), v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a0)
+; CHECK-NEXT: csrr t5, vlenb
+; CHECK-NEXT: slli t5, t5, 4
+; CHECK-NEXT: add t5, sp, t5
+; CHECK-NEXT: addi t5, t5, 16
+; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: csrr t6, vlenb
+; CHECK-NEXT: slli t6, t6, 2
+; CHECK-NEXT: add t5, t5, t6
+; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsseg4e64.v v20, (zero)
+; CHECK-NEXT: j .LBB0_1
+entry:
+ store double 0.000000e+00, ptr %var_117, align 8
+ store double 1.000000e+00, ptr %arrayinit.element3061, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %2 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3059, i64 0)
+ %3 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3067, i64 0)
+ %4 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3065, i64 0)
+ %5 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3063, i64 0)
+ %6 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3055, i64 0)
+ %7 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3057, i64 0)
+ %8 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3053, i64 0)
+ %9 = call <vscale x 2 x double> @llvm.riscv.vle.nxv2f64.p0.i64(<vscale x 2 x double> zeroinitializer, ptr %arrayinit.element3051, i64 0)
+ %10 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3047, i64 0)
+ %11 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3049, i64 0)
+ call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) zeroinitializer, ptr null, i64 0, i64 5)
+ %12 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) zeroinitializer, <vscale x 2 x float> %8, i32 0)
+ %13 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %12, <vscale x 2 x float> %7, i32 2)
+ %14 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %13, <vscale x 2 x float> %6, i32 0)
+ %15 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %14, <vscale x 2 x float> %5, i32 0)
+ %16 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %15, <vscale x 2 x float> %4, i32 0)
+ %17 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %16, <vscale x 2 x float> %3, i32 0)
+ %18 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %17, <vscale x 2 x float> %2, i32 0)
+ call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %18, ptr %arrayinit.element3045, i64 0, i64 5)
+ %19 = tail call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64.nxv2f64.i64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %9, <vscale x 2 x i1> zeroinitializer, i64 0)
+ %20 = tail call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> %11, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %10, <vscale x 2 x i1> %19, i64 0, i64 0)
+ call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv16i8_2t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, ptr null, i64 %var_94_i.07698, i64 6)
+ call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv2i1.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) zeroinitializer, ptr null, <vscale x 2 x i1> zeroinitializer, i64 0, i64 6)
+ %21 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 2 x i32> %20, i32 0)
+ call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %21, ptr %var_117, i64 0, i64 5)
+ call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1, ptr null, i64 0, i64 6)
+ br label %for.body
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
index dd9960d..9c2fa9d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir
@@ -32,10 +32,10 @@ body: |
; CHECK-NEXT: $x11 = ADDI $x2, 16
; CHECK-NEXT: VS4R_V $v0m4, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s256>) into %stack.0, align 8)
; CHECK-NEXT: $x12 = PseudoReadVLENB
- ; CHECK-NEXT: $x13 = SLLI $x12, 2
- ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+ ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
; CHECK-NEXT: VS2R_V $v4m2, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s128>) into %stack.0, align 8)
- ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+ ; CHECK-NEXT: $x12 = SRLI killed $x12, 1
; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
; CHECK-NEXT: VS1R_V $v6, killed $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s64>) into %stack.0)
; CHECK-NEXT: $x11 = ADDI $x2, 16
@@ -93,10 +93,10 @@ body: |
; CHECK-NEXT: $x11 = ADDI $x2, 16
; CHECK-NEXT: $v10m2 = VL2RE8_V $x11 :: (load (<vscale x 1 x s128>) from %stack.0, align 8)
; CHECK-NEXT: $x12 = PseudoReadVLENB
- ; CHECK-NEXT: $x13 = SLLI $x12, 1
- ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
+ ; CHECK-NEXT: $x11 = ADD killed $x11, $x12
; CHECK-NEXT: $v12m4 = VL4RE8_V $x11 :: (load (<vscale x 1 x s256>) from %stack.0, align 8)
- ; CHECK-NEXT: $x12 = SLLI killed $x12, 2
+ ; CHECK-NEXT: $x12 = SLLI killed $x12, 1
; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12
; CHECK-NEXT: $v16 = VL1RE8_V killed $x11 :: (load (<vscale x 1 x s64>) from %stack.0)
; CHECK-NEXT: VS1R_V killed $v10, killed renamable $x10
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll
index ed67344..4817e74 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll
@@ -16,7 +16,6 @@
define void @case1() local_unnamed_addr {
; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16
; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]]
- ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3
%1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str)
%2 = tail call target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.2)
%3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0)
@@ -29,8 +28,7 @@ define void @case1() local_unnamed_addr {
define void @case2() local_unnamed_addr {
; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16
; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]]
- ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3
- ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#VEC_SHUFFLE]] %[[#UNDEF_INT4]] 0 1 2
+ ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#CAST_LOAD]] %[[#UNDEF_INT4]] 0 1 2
%1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str)
%2 = tail call target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v3i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.3)
%3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll
index 8491328..a1ec2cd 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll
@@ -26,3 +26,25 @@ entry:
store <4 x i32> %6, ptr addrspace(11) %7, align 16
ret void
}
+
+; This tests a load from a pointer that has been bitcast between vector types
+; that share the same total bit width but have different numbers of elements.
+; It checks that legalize-pointer-casts handles this correctly by moving the
+; bitcast onto the loaded value rather than the pointer.
+
+define void @main2() local_unnamed_addr #0 {
+entry:
+; CHECK: %[[LOAD:[0-9]+]] = OpLoad %[[#v2_double]] {{.*}}
+; CHECK: %[[BITCAST1:[0-9]+]] = OpBitcast %[[#v4_uint]] %[[LOAD]]
+; CHECK: %[[BITCAST2:[0-9]+]] = OpBitcast %[[#v2_double]] %[[BITCAST1]]
+; CHECK: OpStore {{%[0-9]+}} %[[BITCAST2]] {{.*}}
+
+ %0 = tail call target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v2f64_12_1t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.2)
+ %2 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 0)
+ %3 = load <4 x i32>, ptr addrspace(11) %2
+ %4 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 1)
+ store <4 x i32> %3, ptr addrspace(11) %4
+ ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
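A rough sketch of the rewrite the new test above expects, expressed in LLVM IR
terms (the backend's actual internal representation may differ; the value names
below are illustrative only). Before legalization, the load type does not match
the buffer's element type:

  ; %p points into a [0 x <2 x double>] storage buffer
  %v = load <4 x i32>, ptr addrspace(11) %p

After legalize-pointer-casts, the load uses the buffer's element type and the
bitcast is applied to the loaded value instead, which is the OpLoad/OpBitcast
pair the CHECK lines verify:

  %w = load <2 x double>, ptr addrspace(11) %p
  %v.cast = bitcast <2 x double> %w to <4 x i32>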
diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll
index 05b8de7..f414ea3 100644
--- a/llvm/test/CodeGen/SystemZ/stackmap.ll
+++ b/llvm/test/CodeGen/SystemZ/stackmap.ll
@@ -84,14 +84,14 @@
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 65535
+; CHECK-NEXT: .long -1
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 65535
+; CHECK-NEXT: .long -1
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 0
diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll
index 6d0f3c5..caf7a1c 100644
--- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll
+++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \
-; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s
+; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s
define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
; CHECK-LABEL: test_amx:
@@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
; CHECK-NEXT: tilezero %tmm1
; CHECK-NEXT: tilezero %tmm2
; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
@@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
%c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
%c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
- %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1)
ret void
}
@@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
index af1a7ae..642c1b7 100644
--- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s
define void @test_tmmultf32ps() {
; CHECK-LABEL: test_tmmultf32ps:
@@ -11,13 +11,3 @@ define void @test_tmmultf32ps() {
}
declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C)
-define void @test_ttmmultf32ps() {
-; CHECK-LABEL: test_ttmmultf32ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: retq
- call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
- ret void
-}
-declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C)
-
diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
deleted file mode 100755
index 1f5758c..0000000
--- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
-; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
-; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR
-
-define void @test_amx(i64 %stride, i8* %addr1) #0 {
-; CHECK-LABEL: test_amx:
-; CHECK: # %bb.0:
-; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0
-; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2
-; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0
-; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx:
-; EGPR: # %bb.0:
-; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e]
-; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e]
-; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e]
-; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e]
-; EGPR-NEXT: retq # encoding: [0xc3]
- call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride)
- ret void
-}
-declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 )
-
-define void @test_amx2(i8* %base, i64 %stride) #0 {
-; O0-LABEL: test_amx2:
-; O0: # %bb.0:
-; O0-NEXT: xorps %xmm0, %xmm0
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw $8, %ax
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
-; O0-NEXT: movw $8, %ax
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
-; O0-NEXT: movw $8, %ax
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
-; O0-NEXT: tilerelease
-; O0-NEXT: retq
-;
-; O2-LABEL: test_amx2:
-; O2: # %bb.0:
-; O2-NEXT: xorps %xmm0, %xmm0
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O2-NEXT: movw $8, %ax
-; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4
-; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
-; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
-; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
-; O2-NEXT: tilerelease
-; O2-NEXT: retq
-;
-; EGPR-LABEL: test_amx2:
-; EGPR: # %bb.0:
-; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0]
-; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00]
-; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
-; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37]
-; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37]
-; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37]
-; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37]
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: retq # encoding: [0xc3]
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- ret void
-}
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64)
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
deleted file mode 100644
index 4f41410..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
+++ /dev/null
@@ -1,136 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s
-
-@buf = dso_local global [2048 x i8] zeroinitializer, align 16
-@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
-define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-; CHECK-LABEL: test_tile_2rpntlvwz0:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
-; CHECK-NEXT: movw %si, %cx
-; CHECK-NEXT: movw %di, %ax
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $cl
-; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl $buf, %esi
-; CHECK-NEXT: movl $32, %edi
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4
-; CHECK-NEXT: movabsq $64, %rbx
-; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
-; CHECK-NEXT: movabsq $64, %rbx
-; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi)
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; CHECK-NEXT: movl $buf2, %edx
-; CHECK-NEXT: movl $32, %esi
-; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; CHECK-NEXT: leaq -8(%rbp), %rsp
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa %rsp, 8
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
- %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
- %3 = extractvalue { x86_amx, x86_amx } %0, 1
- %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
- %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
- %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
- %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
- %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
- %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
- %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
- %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
- %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
- ret void
-}
-
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
-
-declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
-declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
-
-attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
-attributes #1 = { argmemonly nofree nounwind readonly }
-attributes #2 = { nofree nosync nounwind readnone }
-attributes #3 = { nounwind }
-attributes #4 = { argmemonly nounwind writeonly }
-
-!llvm.module.flags = !{!0, !1, !2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"uwtable", i32 2}
-!2 = !{i32 7, !"frame-pointer", i32 2}
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
deleted file mode 100644
index ab12ab3..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
+++ /dev/null
@@ -1,165 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers: []
-liveins:
- - { reg: '$edi', virtual-reg: '' }
- - { reg: '$esi', virtual-reg: '' }
- - { reg: '$edx', virtual-reg: '' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1024
- adjustsStack: false
- hasCalls: true
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $rdi, $rsi, $rdx, $rax
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: renamable $rcx = MOV32ri64 64
- ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
- ; CHECK-NEXT: renamable $cx = MOV16ri 64
- ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
- ; CHECK-NEXT: renamable $cx = MOV16ri 16
- ; CHECK-NEXT: renamable $r8w = MOV16ri 16
- ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4)
- ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
- ; CHECK-NEXT: renamable $r9 = COPY $rsi
- ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
- ; CHECK-NEXT: renamable $r8 = COPY $rdi
- ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
- ; CHECK-NEXT: renamable $r10 = COPY $rax
- ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
- ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
- ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5
- ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
- ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
- ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
- ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
- ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
- ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
- ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
- ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
- renamable $zmm0 = AVX512_512_SET0
- VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
- MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
- renamable $rcx = MOV32ri64 64
- MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
- renamable $cx = MOV16ri 64
- MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
- renamable $cx = MOV16ri 16
- renamable $r8w = MOV16ri 16
- MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
- PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
- renamable $r9 = COPY $rsi
- $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
- renamable $r8 = COPY $rdi
- $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
- renamable $r10 = COPY $rax
- $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
- renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
- renamable $tmm0 = COPY renamable $tmm5
- renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
- PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
- PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
- PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
- renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
- renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
- renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
- PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
deleted file mode 100644
index c7d241f..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
+++ /dev/null
@@ -1,153 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s
-
---- |
- @buf = dso_local global [2048 x i8] zeroinitializer, align 16
- @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
- define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
- entry:
- %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = extractvalue { x86_amx, x86_amx } %0, 1
- %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
- %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5
- ret void
- }
-
- declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1
-
- declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
- declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
- declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
- declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4
-
- attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
- attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #5 = { nounwind }
-
-...
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: gr32, preferred-register: '' }
- - { id: 1, class: gr32, preferred-register: '' }
- - { id: 2, class: gr32, preferred-register: '' }
- - { id: 3, class: gr16, preferred-register: '' }
- - { id: 4, class: gr16, preferred-register: '' }
- - { id: 5, class: gr16, preferred-register: '' }
- - { id: 6, class: gr64, preferred-register: '' }
- - { id: 7, class: gr64_nosp, preferred-register: '' }
- - { id: 8, class: tilepair, preferred-register: '' }
- - { id: 9, class: tile, preferred-register: '' }
- - { id: 10, class: tile, preferred-register: '' }
- - { id: 11, class: tile, preferred-register: '' }
- - { id: 12, class: tile, preferred-register: '' }
- - { id: 13, class: gr64, preferred-register: '' }
- - { id: 14, class: vr512, preferred-register: '' }
-liveins:
- - { reg: '$edi', virtual-reg: '%0' }
- - { reg: '$esi', virtual-reg: '%1' }
- - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 4
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $edi, $esi, $edx
-
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $edi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4)
- ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4)
- ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4)
- ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
- ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
- ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf
- ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32
- ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1
- ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2
- ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]]
- ; CHECK-NEXT: RET 0
- %2:gr32 = COPY $edx
- %1:gr32 = COPY $esi
- %0:gr32 = COPY $edi
- %14:vr512 = AVX512_512_SET0
- VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4)
- MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
- PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
- %6:gr64 = MOV32ri64 @buf
- %7:gr64_nosp = MOV32ri64 32
- %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg
- %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit
- %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1
- %13:gr64 = MOV32ri64 @buf2
- PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12
- RET 0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
deleted file mode 100644
index 66b15aa..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
+++ /dev/null
@@ -1,97 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers: []
-liveins:
- - { reg: '$edi', virtual-reg: '' }
- - { reg: '$esi', virtual-reg: '' }
- - { reg: '$edx', virtual-reg: '' }
- - { reg: '$cx', virtual-reg: '' }
- - { reg: '$r9', virtual-reg: '' }
- - { reg: '$r10', virtual-reg: '' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1024
- adjustsStack: false
- hasCalls: true
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
-
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
- ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
- ; CHECK-NEXT: $rax = MOV64ri 64
- ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3)
- ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3)
- ; CHECK-NEXT: $rax = MOV64ri 64
- ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2)
- ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2)
- ; CHECK-NEXT: renamable $r8 = MOV32ri64 64
- ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1)
- ; CHECK-NEXT: renamable $di = MOV16ri 64
- ; CHECK-NEXT: renamable $cx = MOV16ri 16
- ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
- ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
- PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4)
- renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
- renamable $tmm0 = COPY renamable $tmm5
- renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
- renamable $r8 = MOV32ri64 64
- MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68)
- renamable $di = MOV16ri 64
- renamable $cx = MOV16ri 16
- PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
- PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
deleted file mode 100644
index 3549875..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
- ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
- ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
-
- @buf = dso_local global [2048 x i8] zeroinitializer, align 16
-
- ; Function Attrs: noinline nounwind optnone uwtable
- define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 {
-; CHECK-LABEL: @test_tile_2rpntlvwz0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]])
-; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
-; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]])
-; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]])
-; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]])
-; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]])
-; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]]
-; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]])
-; CHECK-NEXT: ret void
-;
- entry:
-
- %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7
- store <256 x i32> %2, ptr %m, align 1024
-
- %3 = extractvalue { x86_amx, x86_amx } %0, 1
- %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7
- store <256 x i32> %4, ptr %m, align 1024
-
- %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7
- %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7
- store <256 x i32> %6, ptr %m, align 64
-
- %7 = load <256 x i32>, ptr %m, align 64
- %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7
- %9 = load <256 x i32>, ptr %m, align 64
- %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7
- %11 = load <256 x i32>, ptr %m, align 64
- %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7
-
- %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7
- %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7
- store <256 x i32> %14, ptr %m, align 64
-
- ret void
- }
-
- ; Function Attrs: argmemonly nounwind readonly
- declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2
-
- ; Function Attrs: nounwind readnone
- declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4
-
- ; Function Attrs: nounwind readnone
- declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3
-
- ; Function Attrs: argmemonly nounwind writeonly
- declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5
-
- attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
- attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #7 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
deleted file mode 100644
index 96966264..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
-
- @buf = dso_local global [2048 x i8] zeroinitializer, align 16
- @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
- ; Function Attrs: nounwind uwtable
- define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-; CHECK-LABEL: @test_tile_2rpntlvwz0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]]
-; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]]
-; CHECK-NEXT: ret void
-;
- entry:
- %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5
- %3 = extractvalue { x86_amx, x86_amx } %0, 1
- %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5
- %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
- %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5
- %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5
- %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5
- %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5
- %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5
- %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5
- %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5
- ret void
- }
-
- ; Function Attrs: argmemonly nounwind readonly
- declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
-
- ; Function Attrs: nounwind readnone
- declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
- ; Function Attrs: nounwind readnone
- declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
- ; Function Attrs: argmemonly nounwind writeonly
- declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
-
- attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
- attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
deleted file mode 100644
index 1e3b242..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
+++ /dev/null
@@ -1,134 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: gr64_nosp, preferred-register: '' }
- - { id: 1, class: gr16, preferred-register: '' }
- - { id: 2, class: gr16, preferred-register: '' }
- - { id: 3, class: gr16, preferred-register: '' }
- - { id: 4, class: gr64, preferred-register: '' }
- - { id: 5, class: gr64, preferred-register: '' }
- - { id: 6, class: gr64, preferred-register: '' }
- - { id: 7, class: gr64_nosp, preferred-register: '' }
- - { id: 8, class: tilepair, preferred-register: '' }
- - { id: 9, class: tile, preferred-register: '' }
- - { id: 10, class: tile, preferred-register: '' }
- - { id: 11, class: tile, preferred-register: '' }
- - { id: 181, class: tile, preferred-register: '' }
- - { id: 183, class: tile, preferred-register: '' }
- - { id: 185, class: tile, preferred-register: '' }
- - { id: 186, class: tile, preferred-register: '' }
-liveins:
- - { reg: '$edi', virtual-reg: '%0' }
- - { reg: '$esi', virtual-reg: '%1' }
- - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1024
- adjustsStack: false
- hasCalls: true
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 21, name: '', type: default, offset: 0, size: 8,
- alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $rdi, $rsi, $rdx, $rax
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64
- ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64
- ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16
- ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16
- ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
- ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]]
- ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]]
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]]
- ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]]
- ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]]
- ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
- %0:gr64_nosp = MOV32ri64 64
- %1:gr16 = MOV16ri 64
- %2:gr16 = MOV16ri 16
- %3:gr16 = MOV16ri 16
- %4:gr64 = COPY $rsi
- %5:gr64 = COPY $rdi
- %6:gr64 = COPY $rdx
- %7:gr64_nosp = COPY $rax
- %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg
- %9:tile = COPY %8.sub_t1
- %10:tile = COPY %8.sub_t0
- PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10
- PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9
- %11:tile = PTILEZEROV %1, %2
- PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11
- %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg
- %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg
- %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg
- %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185
- PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
deleted file mode 100644
index ac2cdb4..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
+++ /dev/null
@@ -1,113 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: gr32, preferred-register: '' }
- - { id: 1, class: gr32, preferred-register: '' }
- - { id: 2, class: gr32, preferred-register: '' }
- - { id: 3, class: gr16, preferred-register: '' }
- - { id: 4, class: gr16, preferred-register: '' }
- - { id: 5, class: gr16, preferred-register: '' }
- - { id: 6, class: gr64, preferred-register: '' }
- - { id: 7, class: gr64_nosp, preferred-register: '' }
- - { id: 8, class: tilepair, preferred-register: '' }
- - { id: 9, class: tile, preferred-register: '' }
- - { id: 10, class: tile, preferred-register: '' }
- - { id: 11, class: tile, preferred-register: '' }
- - { id: 12, class: tile, preferred-register: '' }
- - { id: 13, class: gr64, preferred-register: '' }
-liveins:
- - { reg: '$edi', virtual-reg: '%0' }
- - { reg: '$esi', virtual-reg: '%1' }
- - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack: []
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $edi, $esi, $edx, $rax, $rbx
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit
- ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4)
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32
- ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]]
- ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]]
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx
- ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
- ; CHECK-NEXT: RET 0
- %2:gr32 = COPY $edx
- %1:gr32 = COPY $esi
- %0:gr32 = COPY $edi
- %3:gr16 = COPY %2.sub_16bit
- %4:gr16 = COPY %1.sub_16bit
- %5:gr16 = COPY %0.sub_16bit
- %6:gr64 = COPY $rax
- %7:gr64_nosp = MOV32ri64 32
- %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg
- %9:tile = COPY %8.sub_t1
- %10:tile = COPY %8.sub_t0
- %11:tile = PTILEZEROV %5, %4
- %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9
- %13:gr64 = COPY $rbx
- PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12
- RET 0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
deleted file mode 100644
index 4cfd97a..0000000
--- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR
-
-define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
-; CHECK-LABEL: test_amx:
-; CHECK: # %bb.0:
-; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0
-; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
-; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0
-; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
-; CHECK-NEXT: ttransposed %tmm3, %tmm1
-; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4
-; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx:
-; EGPR: # %bb.0:
-; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31]
-; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31]
-; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31]
-; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31]
-; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb]
-; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca]
-; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5]
-; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca]
-; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca]
-; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca]
-; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca]
-; EGPR-NEXT: retq # encoding: [0xc3]
- call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
- call void @llvm.x86.ttransposed(i8 1, i8 3)
- call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
- call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.tconjtfp16(i8 1, i8 2)
- ret void
-}
-
-declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
-declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
-declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2)
-declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B)
-
-define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, %ax
-; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
-; CHECK-NEXT: tilezero %tmm1
-; CHECK-NEXT: tilezero %tmm2
-; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: movabsq $64, %rbp
-; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
-; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3
-; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0
-; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx)
-; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx2:
-; EGPR: # %bb.0:
-; EGPR-NEXT: pushq %rbp # encoding: [0x55]
-; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00]
-; EGPR-NEXT: # imm = 0xB70
-; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d]
-; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00]
-; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
-; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8]
-; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0]
-; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0]
-; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0]
-; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0]
-; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0]
-; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
-; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00]
-; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
-; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00]
-; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8]
-; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3]
-; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17]
-; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00]
-; EGPR-NEXT: # imm = 0xB70
-; EGPR-NEXT: popq %rbp # encoding: [0x5d]
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT: retq # encoding: [0xc3]
-
- %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
- %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
- %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
- %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
- %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
- %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b)
- %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b)
- %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b)
- %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5)
-
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4)
- ret void
-}
-
-define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movw $8, %cx
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: ttransposed %tmm4, %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx3:
-; EGPR: # %bb.0:
-; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff]
-; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
-; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00]
-; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00]
-; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
-; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00]
-; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16]
-; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4]
-; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17]
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT: retq # encoding: [0xc3]
- %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %5 = extractvalue { x86_amx, x86_amx } %4, 0
- %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
- ret void
-}
-
-define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx_spill:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, %ax
-; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: movabsq $64, %rcx
-; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx)
-; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx_spill:
-; EGPR: # %bb.0:
-; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00]
-; EGPR-NEXT: # imm = 0x17C8
-; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe]
-; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00]
-; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80]
-; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
-; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16]
-; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00]
-; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00]
-; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0]
-; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00]
-; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00]
-; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00]
-; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0]
-; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16]
-; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00]
-; EGPR-NEXT: # imm = 0x17C8
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT: retq # encoding: [0xc3]
- %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
- %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %e11 = extractvalue { x86_amx, x86_amx } %b1, 0
- %e12 = extractvalue { x86_amx, x86_amx } %b1, 1
- %e21 = extractvalue { x86_amx, x86_amx } %b2, 0
- %e22 = extractvalue { x86_amx, x86_amx } %b2, 1
- %e31 = extractvalue { x86_amx, x86_amx } %b3, 0
- %e32 = extractvalue { x86_amx, x86_amx } %b3, 1
- %e41 = extractvalue { x86_amx, x86_amx } %b4, 0
- %e42 = extractvalue { x86_amx, x86_amx } %b4, 1
- %e51 = extractvalue { x86_amx, x86_amx } %b5, 0
- %e52 = extractvalue { x86_amx, x86_amx } %b5, 1
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
- ret void
-}
-
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
-declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
-declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx)
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 06e7d47..8007d9d 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -203,14 +203,24 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $32, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB5_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: .LBB5_2:
+; X86-NEXT: andl 4(%eax), %esi
+; X86-NEXT: andl (%eax), %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: setne %al
+; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
@@ -232,20 +242,38 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB6_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB6_2:
+; X86-NEXT: movl (%edx), %ecx
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl %eax, %ebp
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: setne %al
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
@@ -272,20 +300,40 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB7_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: .LBB7_2:
+; X86-NEXT: movl (%edx), %eax
+; X86-NEXT: movl 4(%edx), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: notl %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: andl %esi, %ebp
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %ecx, %edi
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: sete %al
+; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
@@ -313,20 +361,38 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB8_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB8_2:
+; X86-NEXT: movl (%edx), %ecx
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl %eax, %ebp
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: setne %al
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
@@ -353,47 +419,52 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
; X86-NEXT: testb $32, %cl
; X86-NEXT: je .LBB9_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl $0, %eax
; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: notl %esi
-; X86-NEXT: notl %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: notl %ebp
; X86-NEXT: je .LBB9_4
; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: .LBB9_4:
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $32, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ebx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setae %al
-; X86-NEXT: movl %esi, 4(%ebx)
-; X86-NEXT: movl %edx, (%ebx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl (%edi), %ecx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: andl %ecx, %ebp
+; X86-NEXT: orl %esi, %ebp
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %ebp, (%edi)
+; X86-NEXT: movl %ebx, 4(%edi)
+; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
@@ -445,25 +516,101 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $96, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $48, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, (%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
+; X86-NEXT: movl 24(%esp,%esi), %edi
+; X86-NEXT: movl 28(%esp,%esi), %eax
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 16(%esp,%esi), %edx
+; X86-NEXT: movl 20(%esp,%esi), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: andl 8(%ebx), %edi
+; X86-NEXT: andl (%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: andl 12(%ebx), %eax
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $96, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %rsi, %rax
+; SSE-NEXT: andq 8(%rdi), %rdx
+; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: movl $1, %edx
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %rdx, %rsi
+; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rdx, %rsi
+; AVX2-NEXT: cmovneq %rax, %rdx
+; AVX2-NEXT: andq 8(%rdi), %rsi
+; AVX2-NEXT: andq (%rdi), %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: xorl %edx, %edx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shlxq %rcx, %rax, %rax
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rax, %rdx
+; AVX512-NEXT: cmovneq %rsi, %rax
+; AVX512-NEXT: andq 8(%rdi), %rdx
+; AVX512-NEXT: andq (%rdi), %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -476,33 +623,124 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 12(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: complement_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btcl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: complement_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: xorq %rcx, %rsi
+; SSE-NEXT: xorq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: setne %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: complement_ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: andq %rsi, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: andq %rdx, %r9
+; AVX-NEXT: xorq %rcx, %rsi
+; AVX-NEXT: xorq %rax, %rdx
+; AVX-NEXT: orq %r8, %r9
+; AVX-NEXT: setne %al
+; AVX-NEXT: movq %rdx, (%rdi)
+; AVX-NEXT: movq %rsi, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -517,33 +755,124 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %edx
+; X86-NEXT: movl 60(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %esi
+; X86-NEXT: movl 52(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl 8(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl (%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl 4(%ebx), %ebx
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %edi
+; X86-NEXT: movl %edx, 8(%edi)
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl %esi, (%edi)
+; X86-NEXT: movl %ecx, 4(%edi)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: reset_eq_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
-; X64-NEXT: btrl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: reset_eq_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: notq %rdx
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: andq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: sete %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reset_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: andnq %rcx, %rsi, %r8
+; AVX-NEXT: andq %rsi, %rcx
+; AVX-NEXT: andnq %rax, %rdx, %rsi
+; AVX-NEXT: andq %rdx, %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: sete %al
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %r8, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -559,33 +888,124 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 12(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: set_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btsl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: set_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: orq %rcx, %rsi
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: setne %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: set_ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: andq %rsi, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: andq %rdx, %r9
+; AVX-NEXT: orq %rcx, %rsi
+; AVX-NEXT: orq %rax, %rdx
+; AVX-NEXT: orq %r8, %r9
+; AVX-NEXT: setne %al
+; AVX-NEXT: movq %rdx, (%rdi)
+; AVX-NEXT: movq %rsi, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -606,9 +1026,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $96, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %ebx
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movzbl 16(%ebp), %eax
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -617,29 +1037,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 72(%esp,%edi), %edx
-; X86-NEXT: movl 76(%esp,%edi), %esi
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 68(%esp,%edi), %ebx
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrb $3, %dl
+; X86-NEXT: andb $12, %dl
+; X86-NEXT: negb %dl
+; X86-NEXT: movsbl %dl, %esi
+; X86-NEXT: movl 64(%esp,%esi), %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: notl %esi
+; X86-NEXT: movl 68(%esp,%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esp,%esi), %ebx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 76(%esp,%esi), %edi
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shll %cl, %edx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -647,53 +1063,72 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl 40(%esp,%eax), %edi
-; X86-NEXT: movl 44(%esp,%eax), %esi
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 12(%ecx), %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ecx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NEXT: movl 104(%esp,%ecx), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 108(%esp,%ebx), %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: notl %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl 36(%esp,%esi), %esi
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 8(%edx), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl 32(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: andl 4(%edi), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 96(%esp,%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: notl %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edi), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%edi,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edi)
-; X86-NEXT: movl %ebx, 4(%edi)
-; X86-NEXT: movl %edx, (%edi)
-; X86-NEXT: setae %al
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl %edx, 4(%ecx)
+; X86-NEXT: sete %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -716,84 +1151,86 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %r9, %rsi
-; SSE-NEXT: notq %r8
; SSE-NEXT: cmovneq %rax, %rdx
; SSE-NEXT: cmovneq %r9, %rax
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: andq %r8, %r10
+; SSE-NEXT: notq %r8
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq 8(%rdi), %r8
+; SSE-NEXT: andq %r9, %r8
; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq (%rdi), %rsi
+; SSE-NEXT: andq %rcx, %rsi
; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: andl $96, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: btl %ecx, %eax
-; SSE-NEXT: setae %al
-; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: sete %al
; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: init_eq_i128:
; AVX2: # %bb.0:
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: movl $1, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
; AVX2-NEXT: xorl %r8d, %r8d
-; AVX2-NEXT: shldq %cl, %rdx, %r8
+; AVX2-NEXT: movl %edx, %edx
; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: shlxq %rcx, %rsi, %rsi
; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rsi
-; AVX2-NEXT: cmovneq %r9, %rax
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: cmovneq %rdx, %r8
-; AVX2-NEXT: cmovneq %r9, %rdx
-; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: andnq (%rdi), %rax, %r8
-; AVX2-NEXT: orq %rdx, %r8
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $96, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: movl (%rdi,%rax), %eax
-; AVX2-NEXT: btl %ecx, %eax
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: movq %rsi, 8(%rdi)
-; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: cmovneq %rsi, %rax
+; AVX2-NEXT: cmovneq %r8, %rsi
+; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX2-NEXT: cmovneq %rcx, %r9
+; AVX2-NEXT: cmovneq %r8, %rcx
+; AVX2-NEXT: movq (%rdi), %rdx
+; AVX2-NEXT: movq 8(%rdi), %r8
+; AVX2-NEXT: andnq %r8, %rax, %r10
+; AVX2-NEXT: andq %rax, %r8
+; AVX2-NEXT: andnq %rdx, %rsi, %r11
+; AVX2-NEXT: andq %rsi, %rdx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: movq %r11, (%rdi)
+; AVX2-NEXT: movq %r10, 8(%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: init_eq_i128:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rax, %rsi
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movl $1, %esi
; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: shlxq %rcx, %rsi, %rsi
; AVX512-NEXT: movl %edx, %edx
; AVX512-NEXT: xorl %r9d, %r9d
; AVX512-NEXT: shldq %cl, %rdx, %r9
; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rsi, %r8
; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: cmovneq %r8, %rax
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: cmovneq %rdx, %r9
-; AVX512-NEXT: cmovneq %r8, %rdx
-; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: andnq (%rdi), %rax, %r8
-; AVX512-NEXT: orq %rdx, %r8
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: andl $96, %eax
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: movl (%rdi,%rax), %eax
-; AVX512-NEXT: btl %ecx, %eax
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: movq %rsi, 8(%rdi)
+; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512-NEXT: cmovneq %rcx, %r9
+; AVX512-NEXT: cmovneq %rax, %rcx
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: movq 8(%rdi), %rdx
+; AVX512-NEXT: andnq %rdx, %r8, %r10
+; AVX512-NEXT: andq %r8, %rdx
+; AVX512-NEXT: andnq %rax, %rsi, %r8
+; AVX512-NEXT: andq %rsi, %rax
+; AVX512-NEXT: orq %r9, %r10
+; AVX512-NEXT: orq %rcx, %r8
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: sete %al
; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %r10, 8(%rdi)
; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
@@ -815,25 +1252,344 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $224, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %eax
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl 40(%ebx), %eax
+; X86-NEXT: andl 8(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 56(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 24(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl 44(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 12(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 60(%edi), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 28(%edi), %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: negl %edx
+; X86-NEXT: movl 192(%esp,%edx), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl 32(%ebx), %ecx
+; X86-NEXT: andl (%ebx), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: andl 16(%ebx), %edi
+; X86-NEXT: andl 48(%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 36(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 20(%ebx), %ecx
+; X86-NEXT: andl 52(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: andl $60, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq -48(%rsp,%rbx), %rdx
+; SSE-NEXT: movq -40(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq -16(%rsp,%rbx), %r11
+; SSE-NEXT: movq -8(%rsp,%rbx), %r10
+; SSE-NEXT: shldq %cl, %r11, %r10
+; SSE-NEXT: movq -32(%rsp,%rbx), %r9
+; SSE-NEXT: movq -24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r8
+; SSE-NEXT: shldq %cl, %r9, %r8
+; SSE-NEXT: movq -56(%rsp,%rbx), %rsi
+; SSE-NEXT: shldq %cl, %rsi, %rdx
+; SSE-NEXT: shldq %cl, %r15, %r11
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -64(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %rsi
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: andq 32(%rdi), %r9
+; SSE-NEXT: andq 48(%rdi), %r11
+; SSE-NEXT: andq 16(%rdi), %rdx
+; SSE-NEXT: orq %r11, %rdx
+; SSE-NEXT: andq 40(%rdi), %r8
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: andq 24(%rdi), %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: andq (%rdi), %rbx
+; SSE-NEXT: orq %r9, %rbx
+; SSE-NEXT: orq %rdx, %rbx
+; SSE-NEXT: andq 8(%rdi), %rsi
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: setne %al
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rsi
+; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq %rbx, %rax
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq -16(%rsp,%rsi), %r11
+; AVX2-NEXT: movq -8(%rsp,%rsi), %r10
+; AVX2-NEXT: shldq %cl, %r11, %r10
+; AVX2-NEXT: movq -32(%rsp,%rsi), %r9
+; AVX2-NEXT: movq -24(%rsp,%rsi), %r14
+; AVX2-NEXT: movq %r14, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: movq -64(%rsp,%rsi), %r15
+; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: shldq %cl, %r14, %r11
+; AVX2-NEXT: shldq %cl, %rbx, %r9
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: shlxq %rcx, %r15, %rcx
+; AVX2-NEXT: andq 32(%rdi), %r9
+; AVX2-NEXT: andq 48(%rdi), %r11
+; AVX2-NEXT: andq 16(%rdi), %rdx
+; AVX2-NEXT: andq 40(%rdi), %r8
+; AVX2-NEXT: andq 56(%rdi), %r10
+; AVX2-NEXT: andq 24(%rdi), %rax
+; AVX2-NEXT: orq %r11, %rdx
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andq (%rdi), %rcx
+; AVX2-NEXT: orq %r9, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: andq 8(%rdi), %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %rax, %rsi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx
+; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq -16(%rsp,%rbx), %r11
+; AVX512-NEXT: movq -8(%rsp,%rbx), %r10
+; AVX512-NEXT: shldq %cl, %r11, %r10
+; AVX512-NEXT: movq -32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
+; AVX512-NEXT: movq %r15, %r8
+; AVX512-NEXT: shldq %cl, %r9, %r8
+; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi
+; AVX512-NEXT: shldq %cl, %rsi, %rdx
+; AVX512-NEXT: shldq %cl, %r15, %r11
+; AVX512-NEXT: shldq %cl, %r14, %r9
+; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
+; AVX512-NEXT: shldq %cl, %rbx, %rsi
+; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
+; AVX512-NEXT: andq 32(%rdi), %r9
+; AVX512-NEXT: andq 48(%rdi), %r11
+; AVX512-NEXT: andq 16(%rdi), %rdx
+; AVX512-NEXT: andq 40(%rdi), %r8
+; AVX512-NEXT: andq 56(%rdi), %r10
+; AVX512-NEXT: andq 24(%rdi), %rax
+; AVX512-NEXT: orq %r11, %rdx
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: andq (%rdi), %rcx
+; AVX512-NEXT: orq %r9, %rcx
+; AVX512-NEXT: orq %rdx, %rcx
+; AVX512-NEXT: andq 8(%rdi), %rsi
+; AVX512-NEXT: orq %r8, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: orq %rcx, %rsi
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -846,33 +1602,572 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $272, %esp # imm = 0x110
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 56(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl 24(%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 60(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 28(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 32(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: movl (%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, 60(%edx)
+; X86-NEXT: movl %edi, 56(%edx)
+; X86-NEXT: movl %ecx, 52(%edx)
+; X86-NEXT: movl %esi, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: complement_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btcl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: complement_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq (%rsp,%rbx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rsp,%rbx), %r8
+; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq 16(%rsp,%rbx), %r9
+; SSE-NEXT: movq 24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -8(%rsp,%rbx), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: movq 24(%rdi), %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 16(%rdi), %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %r8, %r13
+; SSE-NEXT: andq %rsi, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %rcx, %r13
+; SSE-NEXT: andq %rbp, %r13
+; SSE-NEXT: andq %rax, %r15
+; SSE-NEXT: orq %r13, %r15
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: movq %r14, %rcx
+; SSE-NEXT: andq %r9, %rcx
+; SSE-NEXT: movq (%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rbx, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r12
+; SSE-NEXT: andq %r10, %r12
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: xorq %rcx, %r10
+; SSE-NEXT: xorq %r14, %r9
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: xorq %rdx, %r11
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
+; SSE-NEXT: movq %r10, 40(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r15, 24(%rdi)
+; SSE-NEXT: movq %rbx, (%rdi)
+; SSE-NEXT: movq %r11, 8(%rdi)
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: complement_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $72, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rbx
+; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX2-NEXT: movq %rbp, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX2-NEXT: shldq %cl, %r8, %r13
+; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX2-NEXT: movq %r14, %r10
+; AVX2-NEXT: shldq %cl, %r9, %r10
+; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX2-NEXT: shldq %cl, %r11, %rsi
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r8, %r14
+; AVX2-NEXT: andq %rsi, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq 56(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r13, %r15
+; AVX2-NEXT: movq 24(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %rax, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq (%rsp,%rbx), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r9, %r15
+; AVX2-NEXT: shlxq %rcx, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq (%rdi), %rbx
+; AVX2-NEXT: movq %rbx, %rbp
+; AVX2-NEXT: andq %rax, %rbp
+; AVX2-NEXT: orq %r15, %rbp
+; AVX2-NEXT: orq %r12, %rbp
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: andq %r10, %rcx
+; AVX2-NEXT: movq 8(%rdi), %r15
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: orq %rcx, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT: xorq %rax, %r10
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: xorq %r15, %r11
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r9, 32(%rdi)
+; AVX2-NEXT: movq %r10, 40(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rbx, (%rdi)
+; AVX2-NEXT: movq %r11, 8(%rdi)
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $72, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: complement_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX512-NEXT: movq %rbp, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX512-NEXT: shldq %cl, %r8, %r13
+; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: shldq %cl, %r9, %r10
+; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX512-NEXT: shldq %cl, %r11, %rsi
+; AVX512-NEXT: shldq %cl, %r14, %r8
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r8, %r14
+; AVX512-NEXT: andq %rsi, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq 56(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r13, %r15
+; AVX512-NEXT: movq 24(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %rax, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: movq (%rsp,%rbx), %rdx
+; AVX512-NEXT: movq 32(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r9, %r15
+; AVX512-NEXT: shlxq %rcx, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq (%rdi), %rbx
+; AVX512-NEXT: movq %rbx, %rbp
+; AVX512-NEXT: andq %rax, %rbp
+; AVX512-NEXT: orq %r15, %rbp
+; AVX512-NEXT: orq %r12, %rbp
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rdx, %r11
+; AVX512-NEXT: movq 40(%rdi), %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: andq %r10, %rcx
+; AVX512-NEXT: movq 8(%rdi), %r15
+; AVX512-NEXT: movq %r15, %r12
+; AVX512-NEXT: andq %r11, %r12
+; AVX512-NEXT: orq %rcx, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: xorq %rax, %r10
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT: xorq %r15, %r11
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r9, 32(%rdi)
+; AVX512-NEXT: movq %r10, 40(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rbx, (%rdi)
+; AVX512-NEXT: movq %r11, 8(%rdi)
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -887,33 +2182,606 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 4(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edi), %eax
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl 12(%edi), %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edi), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl 52(%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl 44(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 256(%esp,%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl 32(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl 52(%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 60(%eax)
+; X86-NEXT: movl %esi, 56(%eax)
+; X86-NEXT: movl %ecx, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %ebx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 48(%eax)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: reset_eq_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
-; X64-NEXT: btrl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: reset_eq_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %r9
+; SSE-NEXT: movq 8(%rsp,%rdx), %r8
+; SSE-NEXT: movq %r8, %rsi
+; SSE-NEXT: shldq %cl, %r9, %rsi
+; SSE-NEXT: movq -8(%rsp,%rdx), %rax
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: movq 16(%rsp,%rdx), %r14
+; SSE-NEXT: movq 24(%rsp,%rdx), %r10
+; SSE-NEXT: movq %r10, %rbx
+; SSE-NEXT: shldq %cl, %r14, %rbx
+; SSE-NEXT: shldq %cl, %r8, %r14
+; SSE-NEXT: movq 32(%rsp,%rdx), %r13
+; SSE-NEXT: movq 40(%rsp,%rdx), %r12
+; SSE-NEXT: shldq %cl, %r13, %r12
+; SSE-NEXT: shldq %cl, %r10, %r13
+; SSE-NEXT: movq -16(%rsp,%rdx), %rdx
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %r12, %rbp
+; SSE-NEXT: movq %r9, %r15
+; SSE-NEXT: movq %rsi, %r11
+; SSE-NEXT: movq 16(%rdi), %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r13
+; SSE-NEXT: andq %r8, %r9
+; SSE-NEXT: orq %r13, %r9
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r12
+; SSE-NEXT: movq 24(%rdi), %r10
+; SSE-NEXT: andq %r10, %rsi
+; SSE-NEXT: orq %r12, %rsi
+; SSE-NEXT: movq %r14, %r13
+; SSE-NEXT: movq 32(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r14
+; SSE-NEXT: movq %rdx, %r12
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %rdx
+; SSE-NEXT: orq %r14, %rdx
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: movq %rbx, %r14
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: andq %rcx, %rbx
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: andq %r8, %rax
+; SSE-NEXT: orq %rbx, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: notq %r11
+; SSE-NEXT: andq %r10, %r11
+; SSE-NEXT: notq %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: notq %r14
+; SSE-NEXT: andq %rcx, %r14
+; SSE-NEXT: notq %r13
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE-NEXT: notq %rbp
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: notq %rcx
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE-NEXT: notq %r9
+; SSE-NEXT: andq %r8, %r9
+; SSE-NEXT: notq %r12
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r13, 32(%rdi)
+; SSE-NEXT: movq %r14, 40(%rdi)
+; SSE-NEXT: movq %r15, 16(%rdi)
+; SSE-NEXT: movq %r11, 24(%rdi)
+; SSE-NEXT: movq %r12, (%rdi)
+; SSE-NEXT: movq %r9, 8(%rdi)
+; SSE-NEXT: sete %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: reset_eq_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rdx
+; AVX2-NEXT: movq -48(%rsp,%rdx), %r8
+; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx
+; AVX2-NEXT: movq %rbx, %rax
+; AVX2-NEXT: shldq %cl, %r8, %rax
+; AVX2-NEXT: movq -16(%rsp,%rdx), %r10
+; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi
+; AVX2-NEXT: shldq %cl, %r10, %rsi
+; AVX2-NEXT: movq -32(%rsp,%rdx), %r11
+; AVX2-NEXT: movq -24(%rsp,%rdx), %r14
+; AVX2-NEXT: movq %r14, %r9
+; AVX2-NEXT: shldq %cl, %r11, %r9
+; AVX2-NEXT: movq -64(%rsp,%rdx), %r15
+; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r14, %r10
+; AVX2-NEXT: shldq %cl, %rbx, %r11
+; AVX2-NEXT: shldq %cl, %r15, %rdx
+; AVX2-NEXT: shlxq %rcx, %r15, %rcx
+; AVX2-NEXT: movq 24(%rdi), %rbx
+; AVX2-NEXT: movq 56(%rdi), %r14
+; AVX2-NEXT: movq 16(%rdi), %r15
+; AVX2-NEXT: movq 48(%rdi), %r13
+; AVX2-NEXT: movq 32(%rdi), %rbp
+; AVX2-NEXT: andnq %rbp, %r11, %r12
+; AVX2-NEXT: andq %r11, %rbp
+; AVX2-NEXT: andnq %r13, %r10, %r11
+; AVX2-NEXT: andq %r10, %r13
+; AVX2-NEXT: andnq %r15, %r8, %r10
+; AVX2-NEXT: andq %r8, %r15
+; AVX2-NEXT: movq 40(%rdi), %r8
+; AVX2-NEXT: orq %r13, %r15
+; AVX2-NEXT: andnq %r8, %r9, %r13
+; AVX2-NEXT: andq %r9, %r8
+; AVX2-NEXT: andnq %r14, %rsi, %r9
+; AVX2-NEXT: andq %rsi, %r14
+; AVX2-NEXT: andnq %rbx, %rax, %rsi
+; AVX2-NEXT: andq %rax, %rbx
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: orq %r14, %rbx
+; AVX2-NEXT: andnq %rax, %rcx, %r14
+; AVX2-NEXT: andq %rcx, %rax
+; AVX2-NEXT: orq %rbp, %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: andnq %rcx, %rdx, %r15
+; AVX2-NEXT: andq %rdx, %rcx
+; AVX2-NEXT: orq %r8, %rcx
+; AVX2-NEXT: orq %rbx, %rcx
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: movq %r11, 48(%rdi)
+; AVX2-NEXT: movq %r9, 56(%rdi)
+; AVX2-NEXT: movq %r12, 32(%rdi)
+; AVX2-NEXT: movq %r13, 40(%rdi)
+; AVX2-NEXT: movq %r10, 16(%rdi)
+; AVX2-NEXT: movq %rsi, 24(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: movq %r15, 8(%rdi)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reset_eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq -48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r8, %rax
+; AVX512-NEXT: movq -16(%rsp,%rbx), %r10
+; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi
+; AVX512-NEXT: shldq %cl, %r10, %rsi
+; AVX512-NEXT: movq -32(%rsp,%rbx), %r11
+; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
+; AVX512-NEXT: movq %r15, %r9
+; AVX512-NEXT: shldq %cl, %r11, %r9
+; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, %r8
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: shldq %cl, %r14, %r11
+; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
+; AVX512-NEXT: shldq %cl, %rbx, %rdx
+; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
+; AVX512-NEXT: movq 24(%rdi), %rbx
+; AVX512-NEXT: movq 56(%rdi), %r14
+; AVX512-NEXT: movq 16(%rdi), %r15
+; AVX512-NEXT: movq 48(%rdi), %r13
+; AVX512-NEXT: movq 32(%rdi), %rbp
+; AVX512-NEXT: andnq %rbp, %r11, %r12
+; AVX512-NEXT: andq %r11, %rbp
+; AVX512-NEXT: andnq %r13, %r10, %r11
+; AVX512-NEXT: andq %r10, %r13
+; AVX512-NEXT: andnq %r15, %r8, %r10
+; AVX512-NEXT: andq %r8, %r15
+; AVX512-NEXT: movq 40(%rdi), %r8
+; AVX512-NEXT: orq %r13, %r15
+; AVX512-NEXT: andnq %r8, %r9, %r13
+; AVX512-NEXT: andq %r9, %r8
+; AVX512-NEXT: andnq %r14, %rsi, %r9
+; AVX512-NEXT: andq %rsi, %r14
+; AVX512-NEXT: andnq %rbx, %rax, %rsi
+; AVX512-NEXT: andq %rax, %rbx
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: orq %r14, %rbx
+; AVX512-NEXT: andnq %rax, %rcx, %r14
+; AVX512-NEXT: andq %rcx, %rax
+; AVX512-NEXT: orq %rbp, %rax
+; AVX512-NEXT: movq 8(%rdi), %rcx
+; AVX512-NEXT: orq %r15, %rax
+; AVX512-NEXT: andnq %rcx, %rdx, %r15
+; AVX512-NEXT: andq %rdx, %rcx
+; AVX512-NEXT: orq %r8, %rcx
+; AVX512-NEXT: orq %rbx, %rcx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: movq %r11, 48(%rdi)
+; AVX512-NEXT: movq %r9, 56(%rdi)
+; AVX512-NEXT: movq %r12, 32(%rdi)
+; AVX512-NEXT: movq %r13, 40(%rdi)
+; AVX512-NEXT: movq %r10, 16(%rdi)
+; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: movq %r14, (%rdi)
+; AVX512-NEXT: movq %r15, 8(%rdi)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq $8, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -929,33 +2797,572 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $272, %esp # imm = 0x110
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 56(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl 24(%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 60(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 28(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 32(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: movl (%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, 60(%edx)
+; X86-NEXT: movl %edi, 56(%edx)
+; X86-NEXT: movl %ecx, 52(%edx)
+; X86-NEXT: movl %esi, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: set_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btsl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: set_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq (%rsp,%rbx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rsp,%rbx), %r8
+; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq 16(%rsp,%rbx), %r9
+; SSE-NEXT: movq 24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -8(%rsp,%rbx), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: movq 24(%rdi), %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 16(%rdi), %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %r8, %r13
+; SSE-NEXT: andq %rsi, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %rcx, %r13
+; SSE-NEXT: andq %rbp, %r13
+; SSE-NEXT: andq %rax, %r15
+; SSE-NEXT: orq %r13, %r15
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: movq %r14, %rcx
+; SSE-NEXT: andq %r9, %rcx
+; SSE-NEXT: movq (%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rbx, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r12
+; SSE-NEXT: andq %r10, %r12
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: orq %rcx, %r10
+; SSE-NEXT: orq %r14, %r9
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
+; SSE-NEXT: movq %r10, 40(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r15, 24(%rdi)
+; SSE-NEXT: movq %rbx, (%rdi)
+; SSE-NEXT: movq %r11, 8(%rdi)
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: set_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $72, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rbx
+; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX2-NEXT: movq %rbp, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX2-NEXT: shldq %cl, %r8, %r13
+; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX2-NEXT: movq %r14, %r10
+; AVX2-NEXT: shldq %cl, %r9, %r10
+; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX2-NEXT: shldq %cl, %r11, %rsi
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r8, %r14
+; AVX2-NEXT: andq %rsi, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq 56(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r13, %r15
+; AVX2-NEXT: movq 24(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %rax, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq (%rsp,%rbx), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r9, %r15
+; AVX2-NEXT: shlxq %rcx, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq (%rdi), %rbx
+; AVX2-NEXT: movq %rbx, %rbp
+; AVX2-NEXT: andq %rax, %rbp
+; AVX2-NEXT: orq %r15, %rbp
+; AVX2-NEXT: orq %r12, %rbp
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: andq %r10, %rcx
+; AVX2-NEXT: movq 8(%rdi), %r15
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: orq %rcx, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT: orq %rax, %r10
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: orq %r15, %r11
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r9, 32(%rdi)
+; AVX2-NEXT: movq %r10, 40(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rbx, (%rdi)
+; AVX2-NEXT: movq %r11, 8(%rdi)
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $72, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: set_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX512-NEXT: movq %rbp, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX512-NEXT: shldq %cl, %r8, %r13
+; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: shldq %cl, %r9, %r10
+; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX512-NEXT: shldq %cl, %r11, %rsi
+; AVX512-NEXT: shldq %cl, %r14, %r8
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r8, %r14
+; AVX512-NEXT: andq %rsi, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq 56(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r13, %r15
+; AVX512-NEXT: movq 24(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %rax, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: movq (%rsp,%rbx), %rdx
+; AVX512-NEXT: movq 32(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r9, %r15
+; AVX512-NEXT: shlxq %rcx, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq (%rdi), %rbx
+; AVX512-NEXT: movq %rbx, %rbp
+; AVX512-NEXT: andq %rax, %rbp
+; AVX512-NEXT: orq %r15, %rbp
+; AVX512-NEXT: orq %r12, %rbp
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rdx, %r11
+; AVX512-NEXT: movq 40(%rdi), %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: andq %r10, %rcx
+; AVX512-NEXT: movq 8(%rdi), %r15
+; AVX512-NEXT: movq %r15, %r12
+; AVX512-NEXT: andq %r11, %r12
+; AVX512-NEXT: orq %rcx, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: orq %rax, %r10
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT: orq %r15, %r11
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r9, 32(%rdi)
+; AVX512-NEXT: movq %r10, 40(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rbx, (%rdi)
+; AVX512-NEXT: movq %r11, 8(%rdi)
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -976,14 +3383,13 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $352, %esp # imm = 0x160
+; X86-NEXT: subl $432, %esp # imm = 0x1B0
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: shrl $3, %edx
; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl %edx, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1016,58 +3422,60 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %edi
-; X86-NEXT: movl 44(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %eax
+; X86-NEXT: movl 56(%esi), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %eax
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 60(%esi), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl 48(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%esi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl 16(%ebp), %ebx
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1092,12 +3500,9 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl %cl, %edi, %edx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1129,148 +3534,273 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 60(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 56(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 52(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 48(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 40(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 44(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 36(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 40(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 32(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 36(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 28(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 32(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 24(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 28(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 20(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 24(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 20(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 12(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 16(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 8(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 12(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 4(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 8(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: andl 4(%edx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%edx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 60(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 56(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 52(%edx)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 44(%edx)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl 56(%edi), %ebx
+; X86-NEXT: movl 60(%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 52(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 48(%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 40(%edi), %ebx
+; X86-NEXT: movl 44(%edi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 32(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 28(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 24(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 16(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%edi), %eax
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%edi), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%edi), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl (%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl %ebx, 8(%edx)
-; X86-NEXT: movl %edi, 4(%edx)
-; X86-NEXT: movl %esi, (%edx)
-; X86-NEXT: setae %al
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 60(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 56(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 48(%eax)
+; X86-NEXT: sete %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1286,8 +3816,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: pushq %r13
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $184, %rsp
-; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: subq $216, %rsp
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
@@ -1300,103 +3829,139 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: andl $56, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %r12
-; SSE-NEXT: movq 160(%rsp,%r12), %rax
-; SSE-NEXT: movq 168(%rsp,%r12), %r10
-; SSE-NEXT: shldq %cl, %rax, %r10
-; SSE-NEXT: movq 152(%rsp,%r12), %rsi
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 144(%rsp,%r12), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 136(%rsp,%r12), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: movq 128(%rsp,%r12), %r14
-; SSE-NEXT: shldq %cl, %r14, %rbx
-; SSE-NEXT: movq 120(%rsp,%r12), %r15
-; SSE-NEXT: shldq %cl, %r15, %r14
-; SSE-NEXT: movq 112(%rsp,%r12), %r13
-; SSE-NEXT: shldq %cl, %r13, %r15
-; SSE-NEXT: shlq %cl, %r13
-; SSE-NEXT: movl %edx, %eax
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %r10
+; SSE-NEXT: movq 184(%rsp,%r10), %r11
+; SSE-NEXT: movq 192(%rsp,%r10), %rsi
+; SSE-NEXT: movq %rsi, %r13
+; SSE-NEXT: shldq %cl, %r11, %r13
+; SSE-NEXT: movq 200(%rsp,%r10), %r15
+; SSE-NEXT: shldq %cl, %rsi, %r15
+; SSE-NEXT: movq 168(%rsp,%r10), %rbx
+; SSE-NEXT: movq 176(%rsp,%r10), %rsi
+; SSE-NEXT: movq %rsi, %r14
+; SSE-NEXT: shldq %cl, %rbx, %r14
+; SSE-NEXT: shldq %cl, %rsi, %r11
+; SSE-NEXT: movq 152(%rsp,%r10), %rax
+; SSE-NEXT: movq 160(%rsp,%r10), %r8
+; SSE-NEXT: movq %r8, %r12
+; SSE-NEXT: shldq %cl, %rax, %r12
+; SSE-NEXT: shldq %cl, %r8, %rbx
+; SSE-NEXT: movq 144(%rsp,%r10), %r9
+; SSE-NEXT: movq %r9, %r8
+; SSE-NEXT: shlq %cl, %r8
+; SSE-NEXT: shldq %cl, %r9, %rax
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movl %edx, %edx
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq 32(%rsp,%r12), %rax
-; SSE-NEXT: movq 40(%rsp,%r12), %rdx
-; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq 16(%rdi), %rdx
; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 24(%rsp,%r12), %rdx
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq 16(%rsp,%r12), %rsi
-; SSE-NEXT: shldq %cl, %rsi, %rdx
-; SSE-NEXT: movq 8(%rsp,%r12), %r8
-; SSE-NEXT: shldq %cl, %r8, %rsi
-; SSE-NEXT: movq (%rsp,%r12), %rbp
-; SSE-NEXT: shldq %cl, %rbp, %r8
-; SSE-NEXT: movq -8(%rsp,%r12), %r9
-; SSE-NEXT: shldq %cl, %r9, %rbp
-; SSE-NEXT: notq %r10
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: notq %r10
-; SSE-NEXT: andq 48(%rdi), %r10
-; SSE-NEXT: orq %rax, %r10
+; SSE-NEXT: movq 48(%rdi), %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rsi, %r13
+; SSE-NEXT: andq %rdx, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %r15, %rsi
+; SSE-NEXT: movq 56(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r15
+; SSE-NEXT: movq %rbx, %r13
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %rbx
+; SSE-NEXT: orq %r15, %rbx
+; SSE-NEXT: movq %r14, %rbp
+; SSE-NEXT: movq 32(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r14
+; SSE-NEXT: movq %r8, %r15
+; SSE-NEXT: movq (%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r8
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: orq %r12, %r8
+; SSE-NEXT: movq %r11, %r12
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: andq %r9, %r11
+; SSE-NEXT: movq %rax, %r14
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %rax
+; SSE-NEXT: orq %r11, %rax
+; SSE-NEXT: orq %rbx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE-NEXT: notq %rax
-; SSE-NEXT: andq 40(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq 56(%rsp,%r10), %r11
+; SSE-NEXT: movq 64(%rsp,%r10), %rax
+; SSE-NEXT: movq %rax, %rbx
+; SSE-NEXT: shldq %cl, %r11, %rbx
+; SSE-NEXT: orq %rbx, %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movq 72(%rsp,%r10), %rbx
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: notq %rbp
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: movq 40(%rsp,%r10), %rax
+; SSE-NEXT: movq 48(%rsp,%r10), %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: orq %rbx, %rbp
+; SSE-NEXT: notq %r12
+; SSE-NEXT: andq %r9, %r12
+; SSE-NEXT: shldq %cl, %rdx, %r11
+; SSE-NEXT: movq 24(%rsp,%r10), %r9
+; SSE-NEXT: movq 32(%rsp,%r10), %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: shldq %cl, %r9, %rbx
+; SSE-NEXT: orq %r11, %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; SSE-NEXT: notq %r11
-; SSE-NEXT: andq 32(%rdi), %r11
-; SSE-NEXT: orq %rsi, %r11
-; SSE-NEXT: notq %rbx
-; SSE-NEXT: andq 24(%rdi), %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: notq %r14
-; SSE-NEXT: andq 16(%rdi), %r14
-; SSE-NEXT: orq %rbp, %r14
-; SSE-NEXT: notq %r15
-; SSE-NEXT: movq -16(%rsp,%r12), %rax
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: andq 8(%rdi), %r15
-; SSE-NEXT: orq %r9, %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: orq %rbx, %r11
; SSE-NEXT: notq %r13
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: andq (%rdi), %r13
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: notq %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: movq 16(%rsp,%r10), %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: notq %r14
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: orq %r9, %r14
+; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andl $60, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; SSE-NEXT: btl %ecx, %eax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq %rax, 56(%rdi)
-; SSE-NEXT: movq %r10, 48(%rdi)
-; SSE-NEXT: movq %rdx, 40(%rdi)
-; SSE-NEXT: movq %r11, 32(%rdi)
-; SSE-NEXT: movq %rbx, 24(%rdi)
-; SSE-NEXT: movq %r14, 16(%rdi)
-; SSE-NEXT: movq %r15, 8(%rdi)
-; SSE-NEXT: movq %r13, (%rdi)
-; SSE-NEXT: setae %al
-; SSE-NEXT: addq $184, %rsp
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rsi, 56(%rdi)
+; SSE-NEXT: movq %rbp, 32(%rdi)
+; SSE-NEXT: movq %r12, 40(%rdi)
+; SSE-NEXT: movq %r11, 16(%rdi)
+; SSE-NEXT: movq %r13, 24(%rdi)
+; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: movq %r14, 8(%rdi)
+; SSE-NEXT: sete %al
+; SSE-NEXT: addq $216, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
; SSE-NEXT: popq %r13
@@ -1413,103 +3978,132 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $168, %rsp
+; AVX2-NEXT: subq $200, %rsp
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: movl %esi, %r11d
-; AVX2-NEXT: shrl $3, %r11d
-; AVX2-NEXT: movl %r11d, %eax
-; AVX2-NEXT: andl $56, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %r10
-; AVX2-NEXT: movq 104(%rsp,%r10), %r15
-; AVX2-NEXT: movq 112(%rsp,%r10), %rax
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: shldq %cl, %r15, %rsi
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 120(%rsp,%r10), %rsi
-; AVX2-NEXT: movq %rsi, %r8
-; AVX2-NEXT: shldq %cl, %rax, %r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 128(%rsp,%r10), %rax
-; AVX2-NEXT: movq %rax, %rbx
-; AVX2-NEXT: shldq %cl, %rsi, %rbx
-; AVX2-NEXT: movq 136(%rsp,%r10), %rsi
-; AVX2-NEXT: movq %rsi, %r14
-; AVX2-NEXT: shldq %cl, %rax, %r14
-; AVX2-NEXT: movq 144(%rsp,%r10), %rax
-; AVX2-NEXT: movq %rax, %r12
-; AVX2-NEXT: shldq %cl, %rsi, %r12
-; AVX2-NEXT: movq 96(%rsp,%r10), %rsi
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 152(%rsp,%r10), %r13
-; AVX2-NEXT: shldq %cl, %rax, %r13
-; AVX2-NEXT: shldq %cl, %rsi, %r15
-; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: movl %esi, %r8d
+; AVX2-NEXT: andl $63, %r8d
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rsi
+; AVX2-NEXT: movq 144(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 152(%rsp,%rsi), %r12
+; AVX2-NEXT: movq %r12, %r10
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %r11, %r10
+; AVX2-NEXT: movq 176(%rsp,%rsi), %r14
+; AVX2-NEXT: movq 184(%rsp,%rsi), %r9
+; AVX2-NEXT: shldq %cl, %r14, %r9
+; AVX2-NEXT: movq 160(%rsp,%rsi), %r15
+; AVX2-NEXT: movq 168(%rsp,%rsi), %r13
+; AVX2-NEXT: movq %r13, %rbx
+; AVX2-NEXT: shldq %cl, %r15, %rbx
+; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp
+; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 136(%rsp,%rsi), %rax
+; AVX2-NEXT: shldq %cl, %rax, %r11
+; AVX2-NEXT: shldq %cl, %r13, %r14
+; AVX2-NEXT: shldq %cl, %r12, %r15
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %edx
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, (%rsp)
; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq 16(%rsp,%r10), %rbp
-; AVX2-NEXT: movq 24(%rsp,%r10), %r9
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq 8(%rsp,%r10), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, %rbp
-; AVX2-NEXT: movq (%rsp,%r10), %rax
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq -8(%rsp,%r10), %r8
-; AVX2-NEXT: shldq %cl, %r8, %rax
-; AVX2-NEXT: movq -16(%rsp,%r10), %rsi
-; AVX2-NEXT: shldq %cl, %rsi, %r8
-; AVX2-NEXT: andnq 56(%rdi), %r13, %r13
-; AVX2-NEXT: orq %r9, %r13
-; AVX2-NEXT: movq -24(%rsp,%r10), %r9
-; AVX2-NEXT: shldq %cl, %r9, %rsi
-; AVX2-NEXT: andnq 48(%rdi), %r12, %r12
-; AVX2-NEXT: andnq 40(%rdi), %r14, %r14
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq 48(%rdi), %rbp
+; AVX2-NEXT: movq 32(%rdi), %r13
+; AVX2-NEXT: andnq %r13, %r15, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r15, %r13
+; AVX2-NEXT: andnq %rbp, %r14, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r14, %rbp
+; AVX2-NEXT: andnq %r12, %r11, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: movq 40(%rdi), %rax
; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: orq %rdx, %r14
-; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT: movq -32(%rsp,%r10), %r10
-; AVX2-NEXT: shlxq %rcx, %r10, %rbx
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %r10, %r9
+; AVX2-NEXT: andnq %rax, %rbx, %rcx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rax, %rbp
+; AVX2-NEXT: andq %rbx, %rbp
+; AVX2-NEXT: movq 56(%rdi), %rcx
+; AVX2-NEXT: andnq %rcx, %r9, %rbx
+; AVX2-NEXT: andq %r9, %rcx
+; AVX2-NEXT: movq 24(%rdi), %rax
+; AVX2-NEXT: andnq %rax, %r10, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r10, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: movq (%rdi), %r10
+; AVX2-NEXT: andnq %r10, %rcx, %r15
+; AVX2-NEXT: andq %rcx, %r10
+; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 48(%rsp,%rsi), %r11
+; AVX2-NEXT: movq %r11, %r9
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: orq %r13, %r10
+; AVX2-NEXT: orq %r12, %r10
+; AVX2-NEXT: movq 8(%rdi), %r13
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andnq 16(%rdi), %r10, %r10
-; AVX2-NEXT: orq %r8, %rcx
-; AVX2-NEXT: orq %rsi, %r10
-; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi
-; AVX2-NEXT: orq %r9, %rsi
-; AVX2-NEXT: andnq (%rdi), %rax, %rax
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: andl $60, %r11d
-; AVX2-NEXT: movl (%rdi,%r11), %r8d
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; AVX2-NEXT: btl %r9d, %r8d
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r12, 48(%rdi)
-; AVX2-NEXT: movq %r14, 40(%rdi)
-; AVX2-NEXT: movq %rdx, 32(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %r10, 16(%rdi)
-; AVX2-NEXT: movq %rsi, 8(%rdi)
-; AVX2-NEXT: movq %rax, (%rdi)
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: addq $168, %rsp
+; AVX2-NEXT: andnq %r13, %rcx, %r12
+; AVX2-NEXT: andq %rcx, %r13
+; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq 56(%rsp,%rsi), %rax
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %r11, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: orq %r9, %r14
+; AVX2-NEXT: orq %rax, %rbx
+; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 24(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 32(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, %r11
+; AVX2-NEXT: shldq %cl, %rax, %r11
+; AVX2-NEXT: shldq %cl, %r9, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: orq %r11, %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: orq %rdx, %rbx
+; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 16(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, %r11
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: shldq %cl, %r9, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq (%rsp,%rsi), %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: shlxq %r8, %rsi, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: orq %rax, %r15
+; AVX2-NEXT: orq %rdx, %r12
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: movq %r14, 48(%rdi)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: movq %rax, 56(%rdi)
+; AVX2-NEXT: movq %rbp, 32(%rdi)
+; AVX2-NEXT: movq %rbx, 40(%rdi)
+; AVX2-NEXT: movq %r9, 16(%rdi)
+; AVX2-NEXT: movq %r11, 24(%rdi)
+; AVX2-NEXT: movq %r15, (%rdi)
+; AVX2-NEXT: movq %r12, 8(%rdi)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: addq $200, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
@@ -1527,100 +4121,131 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $152, %rsp
+; AVX512-NEXT: subq $184, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: movl %esi, %ecx
; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: movl %esi, %r8d
-; AVX512-NEXT: shrl $3, %r8d
-; AVX512-NEXT: movl %r8d, %eax
-; AVX512-NEXT: andl $56, %eax
-; AVX512-NEXT: negl %eax
-; AVX512-NEXT: movslq %eax, %r9
-; AVX512-NEXT: movq 88(%rsp,%r9), %r10
-; AVX512-NEXT: movq 96(%rsp,%r9), %rax
-; AVX512-NEXT: movq %rax, %rsi
-; AVX512-NEXT: shldq %cl, %r10, %rsi
-; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 104(%rsp,%r9), %rsi
-; AVX512-NEXT: movq %rsi, %r11
-; AVX512-NEXT: shldq %cl, %rax, %r11
-; AVX512-NEXT: movq 112(%rsp,%r9), %rax
-; AVX512-NEXT: movq %rax, %rbx
-; AVX512-NEXT: shldq %cl, %rsi, %rbx
-; AVX512-NEXT: movq 120(%rsp,%r9), %rsi
-; AVX512-NEXT: movq %rsi, %r14
-; AVX512-NEXT: shldq %cl, %rax, %r14
-; AVX512-NEXT: movq 128(%rsp,%r9), %rax
-; AVX512-NEXT: movq %rax, %r12
-; AVX512-NEXT: shldq %cl, %rsi, %r12
-; AVX512-NEXT: movq 136(%rsp,%r9), %r13
-; AVX512-NEXT: shldq %cl, %rax, %r13
-; AVX512-NEXT: movq 80(%rsp,%r9), %r15
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: movl %edx, %eax
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rsi
+; AVX512-NEXT: movq 128(%rsp,%rsi), %r10
+; AVX512-NEXT: movq 136(%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rax
+; AVX512-NEXT: shldq %cl, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 160(%rsp,%rsi), %r14
+; AVX512-NEXT: movq 168(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 144(%rsp,%rsi), %r15
+; AVX512-NEXT: movq 152(%rsp,%rsi), %r11
+; AVX512-NEXT: movq %r11, %rbx
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: movq 120(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %r10
+; AVX512-NEXT: shldq %cl, %r11, %r14
+; AVX512-NEXT: movq %rdi, %r9
+; AVX512-NEXT: movq 112(%rsp,%rsi), %r11
+; AVX512-NEXT: shldq %cl, %r12, %r15
+; AVX512-NEXT: movl %edx, %edx
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq (%rsp,%r9), %rbp
-; AVX512-NEXT: movq 8(%rsp,%r9), %rsi
-; AVX512-NEXT: shldq %cl, %rbp, %rsi
-; AVX512-NEXT: movq -8(%rsp,%r9), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, %rbp
-; AVX512-NEXT: movq -16(%rsp,%r9), %rax
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: andnq 56(%rdi), %r13, %r13
-; AVX512-NEXT: andnq 48(%rdi), %r12, %r12
-; AVX512-NEXT: orq %rsi, %r13
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: andnq 40(%rdi), %r14, %r14
-; AVX512-NEXT: orq %rdx, %r14
-; AVX512-NEXT: movq -24(%rsp,%r9), %rsi
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: movq -32(%rsp,%r9), %rax
-; AVX512-NEXT: shldq %cl, %rax, %rsi
-; AVX512-NEXT: shlxq %rcx, %r15, %rbx
-; AVX512-NEXT: andnq 24(%rdi), %r11, %r11
-; AVX512-NEXT: orq %rsi, %r11
-; AVX512-NEXT: movq -48(%rsp,%r9), %rsi
-; AVX512-NEXT: movq -40(%rsp,%r9), %r9
-; AVX512-NEXT: shldq %cl, %r9, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: andnq 16(%rdi), %r15, %r15
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq 48(%rdi), %r13
+; AVX512-NEXT: movq 32(%rdi), %rbp
+; AVX512-NEXT: andnq %rbp, %r15, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r15, %rbp
+; AVX512-NEXT: andnq %r13, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r14, %r13
+; AVX512-NEXT: andnq %r12, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r10, %r12
+; AVX512-NEXT: movq 40(%rdi), %r8
+; AVX512-NEXT: orq %r13, %r12
+; AVX512-NEXT: andnq %r8, %rbx, %rdi
+; AVX512-NEXT: andq %rbx, %r8
+; AVX512-NEXT: movq 56(%r9), %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: andnq %r13, %rdx, %r10
+; AVX512-NEXT: andq %rdx, %r13
+; AVX512-NEXT: movq 24(%r9), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: andnq %rax, %rdx, %r15
+; AVX512-NEXT: andq %rdx, %rax
+; AVX512-NEXT: orq %r13, %rax
+; AVX512-NEXT: shlxq %rcx, %r11, %r13
+; AVX512-NEXT: movq (%r9), %rdx
+; AVX512-NEXT: andnq %rdx, %r13, %r14
+; AVX512-NEXT: andq %r13, %rdx
+; AVX512-NEXT: orq %rbp, %rdx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r11, %rbp
+; AVX512-NEXT: orq %r12, %rdx
+; AVX512-NEXT: movq 8(%r9), %r13
+; AVX512-NEXT: andnq %r13, %rbp, %rbx
+; AVX512-NEXT: andq %rbp, %r13
+; AVX512-NEXT: orq %r8, %r13
+; AVX512-NEXT: movq 24(%rsp,%rsi), %r8
+; AVX512-NEXT: orq %rax, %r13
+; AVX512-NEXT: movq 32(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, %r12
+; AVX512-NEXT: shldq %cl, %r8, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: orq %r12, %r11
+; AVX512-NEXT: movq 40(%rsp,%rsi), %r12
+; AVX512-NEXT: shldq %cl, %rax, %r12
+; AVX512-NEXT: orq %r12, %r10
+; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
+; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: orq %rbp, %r10
+; AVX512-NEXT: shldq %cl, %r12, %r8
+; AVX512-NEXT: orq %r8, %rdi
+; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
+; AVX512-NEXT: movq (%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: shldq %cl, %r8, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT: orq %rbp, %rdi
+; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
+; AVX512-NEXT: shldq %cl, %r12, %rax
; AVX512-NEXT: orq %rax, %r15
; AVX512-NEXT: shlxq %rcx, %rsi, %rax
; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rsi, %r9
-; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx
-; AVX512-NEXT: orq %r9, %rcx
-; AVX512-NEXT: andnq (%rdi), %rbx, %rsi
-; AVX512-NEXT: orq %rax, %rsi
-; AVX512-NEXT: andl $60, %r8d
-; AVX512-NEXT: movl (%rdi,%r8), %eax
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
-; AVX512-NEXT: btl %r8d, %eax
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r12, 48(%rdi)
-; AVX512-NEXT: movq %r14, 40(%rdi)
-; AVX512-NEXT: movq %rdx, 32(%rdi)
-; AVX512-NEXT: movq %r11, 24(%rdi)
-; AVX512-NEXT: movq %r15, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 8(%rdi)
-; AVX512-NEXT: movq %rsi, (%rdi)
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: addq $152, %rsp
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: orq %rdx, %r13
+; AVX512-NEXT: movq %r11, 48(%r9)
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq %rax, 56(%r9)
+; AVX512-NEXT: movq %r10, 32(%r9)
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq %rax, 40(%r9)
+; AVX512-NEXT: movq %rdi, 16(%r9)
+; AVX512-NEXT: movq %r15, 24(%r9)
+; AVX512-NEXT: movq %r14, (%r9)
+; AVX512-NEXT: movq %rbx, 8(%r9)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq $184, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
@@ -1649,25 +4274,2749 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i4096:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $4064, %edx # imm = 0xFE0
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $1792, %esp # imm = 0x700
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $508, %ecx # imm = 0x1FC
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 248(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 504(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 508(%esi), %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 124(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 376(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 380(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 184(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 188(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 440(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 444(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 312(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 316(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 216(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 220(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 472(%esi), %edi
+; X86-NEXT: movl 476(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 344(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 348(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 408(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 412(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 280(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 284(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 232(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 236(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 488(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 492(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 104(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 108(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 360(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 364(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 168(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 172(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 424(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 428(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 296(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 300(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 200(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 204(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 456(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 460(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 76(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 328(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 332(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 140(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 392(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 396(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 264(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 268(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 244(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 496(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 500(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 112(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 368(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 372(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 176(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 180(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 432(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 436(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 304(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 308(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 208(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 212(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 464(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 468(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 84(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 336(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 340(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 148(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 400(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 404(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 272(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 276(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 224(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 228(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 480(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 484(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 100(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 352(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 356(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 164(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 416(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 420(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 288(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 292(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 196(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 448(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 452(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 320(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 324(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 132(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl 256(%esi), %edi
+; X86-NEXT: movl 260(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 388(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 4(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl $1, %eax, %edi
+; X86-NEXT: shrl %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: notb %cl
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: movb $32, %cl
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl (%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: jne .LBB20_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: .LBB20_2:
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 320(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 64(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 448(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 192(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 288(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 32(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 416(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 160(%eax), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 352(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 96(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 480(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 224(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 272(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 16(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 400(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 144(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 336(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 80(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 464(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 208(%eax), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 304(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 48(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 432(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 176(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 368(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 112(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 496(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl 240(%eax), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 264(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 8(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 392(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 136(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 328(%ebx), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 72(%ebx), %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 456(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 200(%ebx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 296(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 40(%ebx), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 424(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 168(%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 360(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 104(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 488(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 232(%ebx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 280(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 24(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 408(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 152(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 344(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 88(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 472(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 216(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 312(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 56(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 440(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 184(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 376(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 120(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 504(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 248(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 324(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 68(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 452(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 196(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 292(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 36(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 420(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 164(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 356(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 100(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 484(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 228(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 276(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 20(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 404(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 148(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 340(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 84(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 468(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 212(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 308(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 52(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 436(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 180(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 372(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 116(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 500(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 244(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 268(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 12(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 396(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 140(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 332(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 76(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 460(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 204(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 300(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 44(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 428(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 172(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 364(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 108(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 492(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 236(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 284(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 28(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 412(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 156(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 348(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 92(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 476(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 220(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 316(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 60(%ebx), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 444(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 188(%ebx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 380(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 124(%ebx), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 508(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: andl 252(%esi), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl 1648(%esp,%ecx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 128(%edx), %ecx
+; X86-NEXT: andl 384(%edx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edx), %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 256(%edx), %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 260(%edx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 4(%edx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 132(%edx), %eax
+; X86-NEXT: andl 388(%edx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i4096:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $4064, %eax # imm = 0xFE0
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i4096:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $1576, %rsp # imm = 0x628
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movslq %eax, %rsi
+; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1432(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1240(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1496(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1112(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; SSE-NEXT: movq 1368(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1272(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1528(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1144(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1400(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1208(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1464(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
+; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %r11, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
+; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %r9, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
+; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rax, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
+; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
+; SSE-NEXT: movq %rbx, %r8
+; SSE-NEXT: shldq %cl, %r13, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
+; SSE-NEXT: movq %r15, %r14
+; SSE-NEXT: shldq %cl, %rdx, %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
+; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, %r14
+; SSE-NEXT: shldq %cl, %r10, %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
+; SSE-NEXT: movq %rbp, %r12
+; SSE-NEXT: shldq %cl, %r14, %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
+; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rbp, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r9
+; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r13
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r12, %r15
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r12, %r10
+; SSE-NEXT: andq 384(%rdi), %r10
+; SSE-NEXT: andq 128(%rdi), %r15
+; SSE-NEXT: andq 320(%rdi), %r13
+; SSE-NEXT: andq 64(%rdi), %rax
+; SSE-NEXT: orq %r10, %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: andq 448(%rdi), %r9
+; SSE-NEXT: andq 192(%rdi), %rbp
+; SSE-NEXT: orq %r9, %rbp
+; SSE-NEXT: orq %rax, %rbp
+; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq 288(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 32(%rdi), %r9
+; SSE-NEXT: andq 416(%rdi), %rdx
+; SSE-NEXT: andq 160(%rdi), %r11
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 352(%rdi), %rdx
+; SSE-NEXT: orq %r9, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 96(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 480(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 224(%rdi), %r8
+; SSE-NEXT: orq %rax, %r8
+; SSE-NEXT: orq %rdx, %r8
+; SSE-NEXT: andq 272(%rdi), %r14
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 16(%rdi), %rax
+; SSE-NEXT: orq %r14, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 400(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 144(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 336(%rdi), %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 80(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 464(%rdi), %rdx
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 208(%rdi), %r11
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: orq %rax, %r11
+; SSE-NEXT: orq %r8, %r11
+; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
+; SSE-NEXT: andq 304(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 48(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 432(%rdi), %r9
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 176(%rdi), %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 368(%rdi), %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 112(%rdi), %rax
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 496(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE-NEXT: andq 240(%rdi), %rbp
+; SSE-NEXT: orq %r8, %rbp
+; SSE-NEXT: orq %rax, %rbp
+; SSE-NEXT: orq %r10, %rbp
+; SSE-NEXT: orq %r11, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 392(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: andq 136(%rdi), %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 328(%rdi), %rdx
+; SSE-NEXT: orq %rax, %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 72(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 456(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE-NEXT: andq 200(%rdi), %r13
+; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: orq %rdx, %r13
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 296(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 40(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 424(%rdi), %r8
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 168(%rdi), %rdx
+; SSE-NEXT: orq %r8, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 360(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 104(%rdi), %rax
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 488(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: andq 232(%rdi), %r15
+; SSE-NEXT: orq %rax, %r15
+; SSE-NEXT: orq %r8, %r15
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 280(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 24(%rdi), %rax
+; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 408(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 152(%rdi), %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 344(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 88(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 472(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE-NEXT: andq 216(%rdi), %r14
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: orq %rax, %r14
+; SSE-NEXT: orq %r8, %r14
+; SSE-NEXT: orq %r10, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 312(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 440(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 184(%rdi), %r9
+; SSE-NEXT: orq %r11, %r10
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: andq 376(%rdi), %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 120(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 504(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 248(%rdi), %r8
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: andq 256(%rdi), %rdx
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq %rbp, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: andq 264(%rdi), %rcx
+; SSE-NEXT: andq 8(%rdi), %rbx
+; SSE-NEXT: orq %rcx, %rbx
+; SSE-NEXT: orq %r12, %rbx
+; SSE-NEXT: orq %r13, %rbx
+; SSE-NEXT: orq %r15, %rbx
+; SSE-NEXT: orq %r8, %rbx
+; SSE-NEXT: orq %rax, %rbx
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $1576, %rsp # imm = 0x628
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i4096:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: movl %esi, %eax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %rsi
+; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %r11, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
+; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %r12, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
+; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
+; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
+; AVX2-NEXT: movq %r8, %rdx
+; AVX2-NEXT: shldq %cl, %r10, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq %rbx, %rdx
+; AVX2-NEXT: shldq %cl, %r9, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
+; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shldq %cl, %r9, %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
+; AVX2-NEXT: movq %r14, %r13
+; AVX2-NEXT: shldq %cl, %r15, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, %r9
+; AVX2-NEXT: andq 384(%rdi), %r9
+; AVX2-NEXT: andq 128(%rdi), %r14
+; AVX2-NEXT: andq 320(%rdi), %r10
+; AVX2-NEXT: orq %r9, %r14
+; AVX2-NEXT: movq %r14, %r15
+; AVX2-NEXT: andq 64(%rdi), %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andq 448(%rdi), %rbp
+; AVX2-NEXT: andq 192(%rdi), %r13
+; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq 288(%rdi), %r8
+; AVX2-NEXT: andq 32(%rdi), %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 416(%rdi), %rax
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: andq 160(%rdi), %r11
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: andq 352(%rdi), %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 96(%rdi), %rax
+; AVX2-NEXT: orq %r12, %r11
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 480(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT: andq 224(%rdi), %r13
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 272(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 16(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r13
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 400(%rdi), %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 144(%rdi), %rax
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 336(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 80(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 464(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 208(%rdi), %r11
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: orq %r8, %r11
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: orq %r9, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 304(%rdi), %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 48(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 432(%rdi), %r10
+; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: andq 176(%rdi), %rax
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: movq %r8, %r9
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 368(%rdi), %r8
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 112(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 496(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 240(%rdi), %r9
+; AVX2-NEXT: orq %r8, %r9
+; AVX2-NEXT: orq %rax, %r9
+; AVX2-NEXT: orq %r10, %r9
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 392(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: andq 136(%rdi), %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 328(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 72(%rdi), %rax
+; AVX2-NEXT: orq %r10, %rbp
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 456(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT: andq 200(%rdi), %r12
+; AVX2-NEXT: orq %rax, %r12
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 296(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 40(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 424(%rdi), %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 168(%rdi), %rax
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq %r8, %r10
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 360(%rdi), %r8
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 104(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 488(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: andq 232(%rdi), %r14
+; AVX2-NEXT: orq %rax, %r14
+; AVX2-NEXT: orq %r8, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 280(%rdi), %r8
+; AVX2-NEXT: orq %r10, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 24(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 408(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 152(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 344(%rdi), %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 88(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 472(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: andq 216(%rdi), %rbx
+; AVX2-NEXT: orq %rax, %rbx
+; AVX2-NEXT: orq %r8, %rbx
+; AVX2-NEXT: orq %r10, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 312(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 56(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 440(%rdi), %r10
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 184(%rdi), %r8
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 376(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 120(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: movq %r8, %r11
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 504(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 248(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r8, %r10
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shlxq %rcx, %rsi, %rax
+; AVX2-NEXT: andq 256(%rdi), %r10
+; AVX2-NEXT: andq (%rdi), %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: orq %r13, %rax
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: andq 264(%rdi), %rcx
+; AVX2-NEXT: andq 8(%rdi), %rdx
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: orq %r12, %rdx
+; AVX2-NEXT: orq %r14, %rdx
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: orq %rax, %rdx
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i4096:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl %esi, %eax
+; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %eax
+; AVX512-NEXT: negl %eax
+; AVX512-NEXT: movslq %eax, %rsi
+; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
+; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
+; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
+; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r12, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
+; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
+; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
+; AVX512-NEXT: movq %rbx, %rdx
+; AVX512-NEXT: shldq %cl, %r11, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
+; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
+; AVX512-NEXT: movq %r8, %rdx
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
+; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, %r15
+; AVX512-NEXT: shldq %cl, %r9, %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
+; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
+; AVX512-NEXT: movq %r15, %r13
+; AVX512-NEXT: shldq %cl, %rbp, %r13
+; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbp, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: andq 384(%rdi), %r9
+; AVX512-NEXT: andq 128(%rdi), %r15
+; AVX512-NEXT: orq %r9, %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq 320(%rdi), %r11
+; AVX512-NEXT: andq 64(%rdi), %rax
+; AVX512-NEXT: orq %r11, %rax
+; AVX512-NEXT: andq 448(%rdi), %r12
+; AVX512-NEXT: andq 192(%rdi), %r13
+; AVX512-NEXT: orq %r12, %r13
+; AVX512-NEXT: orq %rax, %r13
+; AVX512-NEXT: andq 288(%rdi), %r8
+; AVX512-NEXT: andq 32(%rdi), %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 416(%rdi), %rax
+; AVX512-NEXT: orq %r8, %r14
+; AVX512-NEXT: andq 160(%rdi), %r10
+; AVX512-NEXT: orq %rax, %r10
+; AVX512-NEXT: andq 352(%rdi), %rbx
+; AVX512-NEXT: orq %r14, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 96(%rdi), %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 480(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: andq 224(%rdi), %r15
+; AVX512-NEXT: orq %rax, %r15
+; AVX512-NEXT: orq %r8, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 272(%rdi), %r8
+; AVX512-NEXT: orq %r10, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 16(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 400(%rdi), %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 144(%rdi), %rax
+; AVX512-NEXT: orq %r9, %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 336(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 80(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 464(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: andq 208(%rdi), %r11
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: orq %r8, %r11
+; AVX512-NEXT: orq %rax, %r11
+; AVX512-NEXT: orq %r9, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 304(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 48(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 432(%rdi), %r9
+; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 176(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: orq %r9, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 368(%rdi), %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 112(%rdi), %rax
+; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: movq %r8, %r10
+; AVX512-NEXT: orq %r9, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 496(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 240(%rdi), %r9
+; AVX512-NEXT: orq %r8, %r9
+; AVX512-NEXT: orq %rax, %r9
+; AVX512-NEXT: orq %r10, %r9
+; AVX512-NEXT: orq %r11, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 392(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: andq 136(%rdi), %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 328(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 72(%rdi), %rax
+; AVX512-NEXT: orq %r10, %rbp
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 456(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT: andq 200(%rdi), %r12
+; AVX512-NEXT: orq %rax, %r12
+; AVX512-NEXT: orq %r8, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 296(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 40(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 424(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 168(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 360(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 104(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 488(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT: andq 232(%rdi), %r14
+; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: orq %r8, %r14
+; AVX512-NEXT: orq %r10, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 280(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 24(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 408(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 152(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: andq 344(%rdi), %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 88(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 472(%rdi), %rax
+; AVX512-NEXT: orq %r11, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: andq 216(%rdi), %rbx
+; AVX512-NEXT: orq %rax, %rbx
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: orq %r10, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 312(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 56(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 440(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 184(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 376(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 120(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 504(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 248(%rdi), %r8
+; AVX512-NEXT: orq %rax, %r8
+; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: orq %r11, %r8
+; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rsi, %r10
+; AVX512-NEXT: orq %rbx, %r8
+; AVX512-NEXT: shlxq %rcx, %rax, %rsi
+; AVX512-NEXT: andq 256(%rdi), %r10
+; AVX512-NEXT: andq (%rdi), %rsi
+; AVX512-NEXT: orq %r10, %rsi
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: orq %r13, %rsi
+; AVX512-NEXT: orq %r15, %rsi
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 264(%rdi), %rax
+; AVX512-NEXT: andq 8(%rdi), %rdx
+; AVX512-NEXT: orq %rax, %rdx
+; AVX512-NEXT: orq %rbp, %rdx
+; AVX512-NEXT: orq %r12, %rdx
+; AVX512-NEXT: orq %r14, %rdx
+; AVX512-NEXT: orq %r8, %rdx
+; AVX512-NEXT: orq %rsi, %rdx
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 4095
%ofs = zext nneg i32 %rem to i4096
%bit = shl nuw i4096 1, %ofs
@@ -1812,8 +7161,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1826,41 +7175,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 36(%esp,%edi), %edx
-; X86-NEXT: movl 40(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%edi), %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
-; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: andl %ebx, (%ecx)
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl 8(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl (%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl 12(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 4(%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: notl %edx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl %edx, 4(%ebx)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 8(%ebx)
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: notl %edi
-; X86-NEXT: andl %edi, 12(%ebx)
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: jae .LBB22_2
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %ebx, 8(%esi)
+; X86-NEXT: movl %ecx, 12(%esi)
+; X86-NEXT: movl %edi, (%esi)
+; X86-NEXT: movl %edx, 4(%esi)
+; X86-NEXT: je .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
@@ -1882,75 +7242,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %rax, %rsi
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: andq %r8, %r10
; SSE-NEXT: notq %r8
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
+; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: jne .LBB22_2
; SSE-NEXT: # %bb.1:
; SSE-NEXT: movl (%rdx), %eax
; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: andq %rsi, (%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: retq
;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX-LABEL: reset_multiload_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: movl $1, %esi
+; AVX-NEXT: xorl %r8d, %r8d
+; AVX-NEXT: shldq %cl, %rsi, %r8
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: shlxq %rcx, %rsi, %r9
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %r9, %r8
+; AVX-NEXT: cmovneq %rax, %r9
+; AVX-NEXT: movq (%rdi), %r10
+; AVX-NEXT: movq 8(%rdi), %r11
+; AVX-NEXT: andnq %r11, %r8, %rcx
+; AVX-NEXT: andq %r8, %r11
+; AVX-NEXT: andnq %r10, %r9, %rsi
+; AVX-NEXT: andq %r9, %r10
+; AVX-NEXT: orq %r11, %r10
+; AVX-NEXT: jne .LBB22_2
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl (%rdx), %eax
+; AVX-NEXT: .LBB22_2:
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %rcx, 8(%rdi)
+; AVX-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
index e73ff79..f270f8f 100644
--- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
@@ -7,7 +7,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
-; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
@@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 {
declare void @bar2()
define preserve_nonecc void @foo2()#0 {
-; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
diff --git a/llvm/test/CodeGen/X86/pr165755.ll b/llvm/test/CodeGen/X86/pr165755.ll
new file mode 100644
index 0000000..3ab484f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr165755.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64
+
+define i32 @PR165755(ptr %p0) {
+; X86-LABEL: PR165755:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %eax
+; X86-NEXT: movb $0, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: PR165755:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: retq
+ %ld64 = load i64, ptr %p0, align 8
+ store i8 0, ptr %p0, align 1
+ %ld32 = load i32, ptr %p0, align 8
+ %mask = and i32 %ld32, 32
+ %zext = zext i32 %mask to i64
+ %srl = lshr i64 %ld64, %zext
+ %res = trunc i64 %srl to i32
+ ret i32 %res
+}