diff options
Diffstat (limited to 'llvm/test/CodeGen')
48 files changed, 8847 insertions, 2926 deletions
diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll index 995d254..26221d0 100644 --- a/llvm/test/CodeGen/AArch64/stackmap.ll +++ b/llvm/test/CodeGen/AArch64/stackmap.ll @@ -81,14 +81,14 @@ ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .hword 8 ; CHECK-NEXT: .hword 0 ; CHECK-NEXT: .hword 0 -; CHECK-NEXT: .word 65535 +; CHECK-NEXT: .word -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll new file mode 100644 index 0000000..e440bee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps half @fadd_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fadd_s16_div: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fadd_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fadd_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fadd_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps float @fadd_s32_div(float %a, float %b) { +; GCN-LABEL: fadd_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fadd_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fadd_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %fadd = fadd double %a, %b + store double %fadd, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fadd_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_add_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fadd_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fadd_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_add_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s2 +; GFX12-NEXT: s_add_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} + +define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fadd_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll index 1a7ccf0..588802c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_kernel void @fcmp_uniform_select(float %a, i32 %b, i32 %c, ptr addrspace(1) %out) { ; GFX7-LABEL: fcmp_uniform_select: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir index 67cc016..b6652f6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy-scc-vcc.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GF8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx700 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GF8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select %s -o - | FileCheck -check-prefixes=GFX11 %s --- name: test_copy_scc_vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 02d0e52..6facdfd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) { ret <4 x i32> %res } -define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) { +define i16 @abs_vgpr_i16(i16 %arg) { ; GFX6-LABEL: abs_vgpr_i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; GFX8-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0 ; GFX10-NEXT: v_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } -define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) { +define i32 @abs_vgpr_i32(i32 %arg) { ; GFX6-LABEL: abs_vgpr_i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) ret i32 %res } -define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { +define i64 @abs_vgpr_i64(i64 %arg) { ; GFX6-LABEL: abs_vgpr_i64: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 @@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call i64 @llvm.abs.i64(i64 %arg, i1 false) ret i64 %res } -define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { +define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-LABEL: abs_vgpr_v4i32: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 @@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX6-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0 ; GFX8-NEXT: v_max_i32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 @@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX8-NEXT: v_max_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v4i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2 @@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX10-NEXT: v_max_i32_e32 v1, v1, v5 ; GFX10-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX10-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v4i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1 ; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6 ; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1250-NEXT: v_readfirstlane_b32 s3, v3 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false) ret <4 x i32> %res } @@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) { ret <2 x i8> %res } -define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { +define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v2i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_max_i16 v0, v0, v2 ; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) { ret <3 x i8> %res } -define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { +define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-LABEL: abs_vgpr_v3i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 @@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX10-NEXT: v_max_i16 v0, v0, v3 ; GFX10-NEXT: v_max_i16 v1, v1, v4 ; GFX10-NEXT: v_max_i16 v2, v2, v5 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_max_i16 v1, v1, v4 ; GFX1250-NEXT: v_max_i16 v2, v2, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NEXT: v_readfirstlane_b32 s2, v2 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) { ret <2 x i16> %res } -define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { +define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v2i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GFX6-NEXT: v_max_i32_e32 v1, v1, v2 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v2i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v1, v0, v1 ; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v2i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false) ret <2 x i16> %res } @@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) { ret <3 x i16> %res } -define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { +define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-LABEL: abs_vgpr_v3i16: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX6-NEXT: v_max_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: ; return to shader part epilog +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: abs_vgpr_v3i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 ; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: abs_vgpr_v3i16: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX10-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX10-NEXT: v_max_i16 v1, v1, v3 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: abs_vgpr_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0 ; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll new file mode 100644 index 0000000..05a0e39 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll @@ -0,0 +1,325 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s + +define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 +; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 +; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 +; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 +; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 +; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 +; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15 +; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 +; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 +; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 +; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 +; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 +; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 +; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 +; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 +; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 +; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 +; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 +; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 +; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 +; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 +; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 +; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 +; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90 +; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 +; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 +; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 +; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 +; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 +; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 +; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 +; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 +; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 +; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 +; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5) + %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 + store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 + %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 + %sum = add <16 x i8> %load, %add + store <16 x i8> %sum, ptr addrspace(3) %out, align 16 + ret void +} + +attributes #0 = {"amdgpu-waves-per-eu"="2,2"} diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index f67cbe3..ddb522a8 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -1,17 +1,17 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; -global-isel=1 SI run line skipped since store not yet implemented. ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=MEMTIME -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s declare i64 @llvm.readcyclecounter() #0 diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll new file mode 100644 index 0000000..22e4a24 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s + +define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v0.h, v0.l, v1.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u16 v0.l, v0.h, v0.l +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub) + ret i32 %cond +} + +define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1 +; GFX9-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v[2:3], v1, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i32 %a, %b + store i32 %sub, ptr addrspace(1) %ptr + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_commute: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub) + ret i64 %cond +} + +define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 { +; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v[4:5], v[2:3], off +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %sub = sub i64 %a, %b + store i64 %sub, ptr addrspace(1) %ptr + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i16 %a, %b + %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a) + ret i16 %cond +} + +define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s1, s0, s1 +; GFX9-NEXT: s_min_u32 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_i32 s1, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i32 %a, %b + %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a) + ret i32 %cond +} + +define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 { +; GFX9-LABEL: s_underflow_compare_fold_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_u32 s2, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_subb_u32 s3, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_underflow_compare_fold_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s2, s0, s2 +; GFX11-NEXT: s_subb_u32 s3, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[0:1] +; GFX11-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s1 +; GFX11-NEXT: ; return to shader part epilog + %sub = sub i64 %a, %b + %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a) + ret i64 %cond +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir index 8a70a8a..32cc398 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir @@ -36,7 +36,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/ $vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x8a + ; GCN-NEXT: s_set_vgpr_msb 0x458a ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/ $vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode @@ -50,7 +50,7 @@ body: | ; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/ $vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xcf + ; GCN-NEXT: s_set_vgpr_msb 0x8acf ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0 ; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/ $vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index f508df2..7e1c28f 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -22,13 +22,13 @@ body: | $vgpr257 = V_MOV_B32_e32 undef $vgpr510, implicit $exec ; Single bit change - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4101 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v2 /*v258*/ $vgpr255 = V_RCP_F32_e64 0, undef $vgpr258, 0, 0, implicit $exec, implicit $mode ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_rcp_f32_e64 v255, v1 $vgpr255 = V_RCP_F32_e64 0, undef $vgpr1, 0, 0, implicit $exec, implicit $mode @@ -40,7 +40,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0, v253 /*v509*/, v252 /*v508*/ $vgpr0 = V_ADD_U32_e32 undef $vgpr509, undef $vgpr508, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_f32_e64 v2 /*v258*/, v0, v251 /*v507*/ @@ -48,7 +48,7 @@ body: | ; VOP3 - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0x4455 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode @@ -58,32 +58,32 @@ body: | $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode ; Tuple crossing the 256 boundary - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x5511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_mqsad_u32_u8 v[254:257], v[2:3] /*v[258:259]*/, v0, v[244:247] /*v[500:503]*/ $vgpr254_vgpr255_vgpr256_vgpr257 = V_MQSAD_U32_U8_e64 $vgpr258_vgpr259, $vgpr0, undef $vgpr500_vgpr501_vgpr502_vgpr503, 0, implicit $exec ; DPP/tied operand - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x1145 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: v_add_nc_u16_e64_dpp v0 /*v256*/, v1 /*v257*/, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr256 = V_ADD_NC_U16_fake16_e64_dpp $vgpr256, 0, $vgpr257, 0, undef $vgpr258, 0, 0, 1, 15, 15, 1, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 17 + ; GCN-NEXT: s_set_vgpr_msb 0x4511 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=1 ; GCN-NEXT: v_add3_u32_e64_dpp v0, v1 /*v257*/, v0, v2 /*v258*/ quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 $vgpr0 = V_ADD3_U32_e64_dpp $vgpr0, $vgpr257, $vgpr0, undef $vgpr258, 1, 15, 15, 1, implicit $exec ; DS (addr, data0, and data1 operands) - ; GCN-NEXT: s_set_vgpr_msb 20 + ; GCN-NEXT: s_set_vgpr_msb 0x1114 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=1 src2=1 ; GCN-NEXT: ds_store_2addr_b32 v0, v248 /*v504*/, v249 /*v505*/ offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr504, undef $vgpr505, 0, 1, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_store_2addr_b32 v0, v248, v249 offset1:1 DS_WRITE2_B32_gfx9 $vgpr0, undef $vgpr248, undef $vgpr249, 0, 1, 0, implicit $exec @@ -93,13 +93,13 @@ body: | ; GCN-NEXT: ds_load_b32 v0, v255 /*v511*/ $vgpr0 = DS_READ_B32_gfx9 $vgpr511, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x144 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v255 /*v511*/, v0, v248 /*v504*/ $vgpr511 = DS_ADD_RTN_U32_gfx9 $vgpr0, undef $vgpr504, 0, 0, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: ds_add_rtn_u32 v0, v0, v0 $vgpr0 = DS_ADD_RTN_U32_gfx9 $vgpr0, $vgpr0, 0, 0, implicit $exec @@ -111,17 +111,17 @@ body: | ; GCN-NEXT: global_load_b32 v2, v[2:3] /*v[258:259]*/, off $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr258_vgpr259, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=0 src2=0 ; GCN-NEXT: global_load_b32 v255 /*v511*/, v0, s[0:1] $vgpr511 = GLOBAL_LOAD_DWORD_SADDR undef $sgpr0_sgpr1, $vgpr0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; ASM-SAME: ; msbs: dst=0 src0=1 src1=0 src2=0 ; GCN-NEXT: scratch_load_u8 v0, v255 /*v511*/, s0 $vgpr0 = SCRATCH_LOAD_UBYTE_SVS $vgpr511, undef $sgpr0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: global_store_b32 v[0:1], v2, off GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec @@ -135,13 +135,13 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[256:257]*/, v[244:246] /*v[500:502]*/, off GLOBAL_STORE_DWORDX3 $vgpr256_vgpr257, $vgpr500_vgpr501_vgpr502, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; ASM-SAME: ; msbs: dst=1 src0=0 src1=1 src2=0 ; GCN-NEXT: flat_atomic_add_u32 v254 /*v510*/, v[0:1], v255 /*v511*/ th:TH_ATOMIC_RETURN $vgpr510 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr511, 0, 1, implicit $exec, implicit $flat_scr ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4400 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: flat_atomic_add_u32 v0, v[0:1], v255 th:TH_ATOMIC_RETURN $vgpr0 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr255, 0, 1, implicit $exec, implicit $flat_scr @@ -156,12 +156,12 @@ body: | ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr0, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=0 src2=0 ; GCN-NEXT: buffer_load_b32 v1 /*v257*/, v0 /*v256*/, s[8:11], s3 offen $vgpr257 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN $vgpr256, undef $sgpr8_sgpr9_sgpr10_sgpr11, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_store_b32 v0, v1, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec @@ -171,7 +171,7 @@ body: | ; GCN-NEXT: buffer_store_b32 v0 /*v256*/, v1 /*v257*/, s[0:3], s3 offen BUFFER_STORE_DWORD_VBUFFER_OFFEN $vgpr256, $vgpr257, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4100 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s3 offen BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN $vgpr0, $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr3, 0, 0, implicit $exec @@ -183,44 +183,44 @@ body: | ; VGPRs above 512 - ; GCN-NEXT: s_set_vgpr_msb 0xaa + ; GCN-NEXT: s_set_vgpr_msb 0x41aa ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xab + ; GCN-NEXT: s_set_vgpr_msb 0xaaab ; ASM-SAME: ; msbs: dst=2 src0=3 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v0 /*v768*/, v2 /*v514*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr768, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0xabae ; ASM-SAME: ; msbs: dst=2 src0=2 src1=3 src2=2 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v770*/, v3 /*v515*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr770, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xba + ; GCN-NEXT: s_set_vgpr_msb 0xaeba ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v512*/, v1 /*v513*/, v2 /*v514*/, v3 /*v771*/ $vgpr512 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xea + ; GCN-NEXT: s_set_vgpr_msb 0xbaea ; ASM-SAME: ; msbs: dst=3 src0=2 src1=2 src2=2 ; GCN-NEXT: v_fma_f32 v255 /*v1023*/, v1 /*v513*/, v2 /*v514*/, v3 /*v515*/ $vgpr1023 = V_FMA_F32_e64 0, undef $vgpr513, 0, undef $vgpr514, 0, undef $vgpr515, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0xff + ; GCN-NEXT: s_set_vgpr_msb 0xeaff ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=3 ; GCN-NEXT: v_fma_f32 v0 /*v768*/, v1 /*v769*/, v2 /*v770*/, v3 /*v771*/ $vgpr768 = V_FMA_F32_e64 0, undef $vgpr769, 0, undef $vgpr770, 0, undef $vgpr771, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x42 + ; GCN-NEXT: s_set_vgpr_msb 0xff42 ; ASM-SAME: ; msbs: dst=1 src0=2 src1=0 src2=0 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 /*v512*/ $vgpr256 = V_MOV_B32_e32 undef $vgpr512, implicit $exec ; Reset - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4200 ; ASM-SAME: ; msbs: dst=0 src0=0 src1=0 src2=0 ; GCN-NEXT: v_fma_f32 v0, v1, v2, v3 $vgpr0 = V_FMA_F32_e64 0, undef $vgpr1, 0, undef $vgpr2, 0, undef $vgpr3, 0, 0, implicit $exec, implicit $mode @@ -232,12 +232,12 @@ body: | ; GCN-NEXT: global_store_b96 v[0:1] /*v[512:513]*/, v[0:2] /*v[512:514]*/, off GLOBAL_STORE_DWORDX3 $vgpr512_vgpr513, $vgpr512_vgpr513_vgpr514, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 11 + ; GCN-NEXT: s_set_vgpr_msb 0xa0b ; ASM-SAME: ; msbs: dst=0 src0=3 src1=2 src2=0 ; GCN-NEXT: global_store_b64 v[254:255] /*v[1022:1023]*/, v[254:255] /*v[766:767]*/, off GLOBAL_STORE_DWORDX2 $vgpr1022_vgpr1023, $vgpr766_vgpr767, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x55 + ; GCN-NEXT: s_set_vgpr_msb 0xb55 ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=1 ; GCN-NEXT: v_wmma_f32_16x16x32_bf16 v[14:21] /*v[270:277]*/, v[26:33] /*v[282:289]*/, v[34:41] /*v[290:297]*/, v[14:21] /*v[270:277]*/ early-clobber $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, undef $vgpr282_vgpr283_vgpr284_vgpr285_vgpr286_vgpr287_vgpr288_vgpr289, 8, undef $vgpr290_vgpr291_vgpr292_vgpr293_vgpr294_vgpr295_vgpr296_vgpr297, 8, killed undef $vgpr270_vgpr271_vgpr272_vgpr273_vgpr274_vgpr275_vgpr276_vgpr277, 0, 0, 0, 0, implicit $exec @@ -247,6 +247,7 @@ body: | ... # ASM-LABEL: {{^}}vopd: + # DIS-LABEL: <vopd>: --- name: vopd @@ -262,35 +263,35 @@ body: | ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, v1, v2 :: v_dual_mul_f32 v0 /*v256*/, v3, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr2, undef $vgpr3, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x4041 ; GCN-NEXT: v_dual_sub_f32 v244 /*v500*/, s1, v2 :: v_dual_mul_f32 v0 /*v256*/, v44 /*v300*/, v4 $vgpr500, $vgpr256 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $sgpr1, undef $vgpr2, undef $vgpr300, undef $vgpr4, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x4104 ; GCN-NEXT: v_dual_sub_f32 v255, v1, v44 /*v300*/ :: v_dual_mul_f32 v6, v0, v1 /*v257*/ $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 undef $vgpr1, undef $vgpr300, undef $vgpr0, $vgpr257, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_dual_sub_f32 v255, 0, v1 :: v_dual_mul_f32 v6, v44 /*v300*/, v3 $vgpr255, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1250 0, undef $vgpr1, undef $vgpr300, undef $vgpr3, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: s_set_vgpr_msb 0x140 ; GCN-NEXT: v_dual_fmamk_f32 v243 /*v499*/, v0, 0xa, v3 :: v_dual_fmac_f32 v0 /*v256*/, v1, v1 $vgpr499, $vgpr256 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr3, undef $vgpr1, undef $vgpr1, $vgpr256, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4005 ; GCN-NEXT: v_dual_mov_b32 v2, v3 /*v259*/ :: v_dual_add_f32 v3, v1 /*v257*/, v2 /*v258*/ $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1250 undef $vgpr259, undef $vgpr257, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x544 ; GCN-NEXT: v_dual_fmamk_f32 v244 /*v500*/, v0, 0xa, v44 /*v300*/ :: v_dual_fmac_f32 v3 /*v259*/, v1, v1 /*v257*/ $vgpr500, $vgpr259 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1250 undef $vgpr0, 10, undef $vgpr300, undef $vgpr1, undef $vgpr257, $vgpr259, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 16 + ; GCN-NEXT: s_set_vgpr_msb 0x4410 ; GCN-NEXT: v_dual_fma_f32 v0, v6, v6, v44 /*v300*/ :: v_dual_fma_f32 v1, v4, v5, v45 /*v301*/ $vgpr0, $vgpr1 = V_DUAL_FMA_F32_e64_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, 0, undef $vgpr300, 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $vgpr301, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x1000 ; GCN-NEXT: v_dual_fmac_f32 v2, v6, v6 :: v_dual_fma_f32 v3, v4, v5, v3 $vgpr2, $vgpr3 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr6, undef $vgpr2, 0, undef $vgpr4, 0, undef $vgpr5, 0, $vgpr3, implicit $mode, implicit $exec @@ -298,7 +299,7 @@ body: | ; GCN-NEXT: v_dual_fma_f32 v244 /*v500*/, v6, v7, v8 :: v_dual_add_f32 v3 /*v259*/, v4, v5 $vgpr500, $vgpr259 = V_DUAL_FMA_F32_e64_X_ADD_F32_e32_e96_gfx1250 0, undef $vgpr6, 0, undef $vgpr7, 0, undef $vgpr8, 0, undef $vgpr4, 0, undef $vgpr5, implicit $mode, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0xae + ; GCN-NEXT: s_set_vgpr_msb 0x40ae ; GCN-NEXT: v_dual_fmac_f32 v2 /*v514*/, v6 /*v518*/, v8 /*v776*/ :: v_dual_fma_f32 v3 /*v515*/, v4 /*v516*/, v7 /*v775*/, v3 /*v515*/ $vgpr514, $vgpr515 = V_DUAL_FMAC_F32_e32_X_FMA_F32_e64_e96_gfx1250 0, undef $vgpr518, 0, undef $vgpr776, undef $vgpr514, 0, undef $vgpr516, 0, undef $vgpr775, 0, $vgpr515, implicit $mode, implicit $exec @@ -319,31 +320,31 @@ body: | ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmaak_f32 v0, v1 /*v257*/, v2 /*v258*/, 0x1 $vgpr0 = V_FMAAK_F32 undef $vgpr257, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1 /*v257*/, v2, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr257, undef $vgpr2, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmaak_f32 v0 /*v256*/, v1, v2 /*v258*/, 0x1 $vgpr256 = V_FMAAK_F32 undef $vgpr1, undef $vgpr258, 1, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; GCN-NEXT: s_set_vgpr_msb 0x4445 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x4505 ; GCN-NEXT: v_fmamk_f32 v0, v1 /*v257*/, 0x1, v2 /*v258*/ $vgpr0 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr258, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x41 + ; GCN-NEXT: s_set_vgpr_msb 0x541 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1 /*v257*/, 0x1, v2 $vgpr256 = V_FMAMK_F32 undef $vgpr257, 1, undef $vgpr2, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0x44 + ; GCN-NEXT: s_set_vgpr_msb 0x4144 ; GCN-NEXT: v_fmamk_f32 v0 /*v256*/, v1, 0x1, v2 /*v258*/ $vgpr256 = V_FMAMK_F32 undef $vgpr1, 1, undef $vgpr258, implicit $exec, implicit $mode @@ -389,15 +390,15 @@ body: | ; GCN-NEXT: v_lshlrev_b32_e64 v0, v0 /*v256*/, v2 $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_lshlrev_b32_e64 v0, v1, v0 /*v256*/ $vgpr0 = V_LSHLREV_B32_e64 undef $vgpr1, undef $vgpr256, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v0 /*v256*/, v2 $vgpr0 = V_SUBREV_U32_e32 undef $vgpr256, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_subrev_nc_u32_e32 v0, v1, v0 /*v256*/ $vgpr0 = V_SUBREV_U32_e32 undef $vgpr1, undef $vgpr256, implicit $exec @@ -417,7 +418,7 @@ body: | ; GCN-NEXT: v_fma_f32 v3 /*v259*/, v4 /*v260*/, v5 /*v261*/, v6 /*v262*/ $vgpr259 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr261, 0, undef $vgpr262, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x5500 ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v2 $vgpr0 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec @@ -431,7 +432,7 @@ body: | ; GCN-NEXT: v_add_nc_u32_e32 v0 /*v256*/, v1, v2 $vgpr256 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr2, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_fma_f32 v3, v4, v5, s2 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr4, 0, undef $vgpr5, 0, undef $sgpr2, 0, 0, implicit $exec, implicit $mode @@ -439,17 +440,17 @@ body: | ; GCN-NEXT: v_fma_f32 v3, v4 /*v260*/, v5, 1 $vgpr3 = V_FMA_F32_e64 0, undef $vgpr260, 0, undef $vgpr5, 0, 1, 0, 0, implicit $exec, implicit $mode - ; GCN-NEXT: s_set_vgpr_msb 4 + ; GCN-NEXT: s_set_vgpr_msb 0x104 ; GCN-NEXT: v_mov_b32_e32 v0, v1 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec ; GCN-NEXT: v_add_nc_u32_e32 v2, v1, v3 /*v259*/ $vgpr2 = V_ADD_U32_e32 undef $vgpr1, undef $vgpr259, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x401 ; GCN-NEXT: v_mov_b32_e32 v0, v0 /*v256*/ ; GCN-NEXT: v_add_nc_u32_e32 v1, v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_add_nc_u32_e32 v2, v2 /*v258*/, v2 /*v258*/ $vgpr0 = V_MOV_B32_e32 undef $vgpr256, implicit $exec $vgpr1 = V_ADD_U32_e32 undef $vgpr257, undef $vgpr1, implicit $exec @@ -478,16 +479,18 @@ body: | ; ASM: .LBB{{.*_1}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec - ; No mode switch on fall through + ; Reset on fallthrough block end bb.2: ; ASM-NEXT: %bb.2: - ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 64 + ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_branch - S_NOP 0 + $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_BRANCH %bb.3 ; Reset mode on terminator @@ -496,7 +499,7 @@ body: | ; ASM: .LBB{{.*_3}}: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_swap_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $exec = S_SWAPPC_B64 undef $sgpr0_sgpr1 @@ -518,7 +521,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr0 = V_MOV_B32_e32 undef $vgpr1, implicit $exec $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -538,7 +541,7 @@ body: | ; ASM-NEXT: %bb.7: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM-NEXT: ; return to shader part epilog $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec SI_RETURN_TO_EPILOG undef $vgpr0, implicit-def $exec @@ -556,7 +559,7 @@ body: | ; ASM-NEXT: %bb.9: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_set_pc_i64 $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_SETPC_B64_return undef $sgpr0_sgpr1, implicit-def $exec @@ -574,13 +577,14 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 $vgpr256 = V_MOV_B32_e32 undef $vgpr0, implicit $exec bb.1: ; ASM: .LBB{{[0-9]+}}_1: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: s_cbranch_scc0 $vgpr257 = V_MOV_B32_e32 undef $vgpr1, implicit $exec S_CBRANCH_SCC0 %bb.1, undef implicit $scc @@ -604,7 +608,7 @@ body: | ; ASM: %bb.0: ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; ASM: def v0 ; GCN-NOT: s_set_vgpr_msb ; ASM: use v0 @@ -638,7 +642,7 @@ body: | ; GCN-NEXT: s_set_vgpr_msb 64 ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: s_nop 0 - ; GCN-NEXT: s_set_vgpr_msb 1 + ; GCN-NEXT: s_set_vgpr_msb 0x4001 ; GCN-NEXT: v_mov_b32_e32 v1, v0 /*v256*/ BUNDLE implicit-def $vgpr256 { $vgpr256 = V_MOV_B32_e32 undef $vgpr1, implicit $exec @@ -680,7 +684,7 @@ body: | ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 ; GCN-NEXT: v_mov_b32_e32 v1 /*v257*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 BUNDLE implicit-def $vgpr256, implicit-def $vgpr257, implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr1 { @@ -709,7 +713,7 @@ body: | ; GCN-NEXT: s_clause 0x3e ; GCN-NEXT: v_mov_b32_e32 v0 /*v256*/, v1 - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x4000 ; GCN-NEXT: v_mov_b32_e32 v1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-COUNT-60: v_mov_b32_e32 v1, v1 @@ -823,7 +827,7 @@ body: | ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v1, v2 $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr1, undef $vgpr2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec @@ -835,11 +839,11 @@ body: | ; GCN-NEXT: v_wmma_ld_scale16_paired_b64 v[0:1], v[2:3] V_WMMA_LD_SCALE16_PAIRED_B64 undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 5 + ; GCN-NEXT: s_set_vgpr_msb 0x105 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[244:259] /*v[500:515]*/, v[244:259] /*v[500:515]*/, v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, undef $vgpr500_vgpr501_vgpr502_vgpr503_vgpr504_vgpr505_vgpr506_vgpr507_vgpr508_vgpr509_vgpr510_vgpr511_vgpr512_vgpr513_vgpr514_vgpr515, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: s_set_vgpr_msb 0 + ; GCN-NEXT: s_set_vgpr_msb 0x500 ; GCN-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[210:217], v[100:115], v[100:115], v[10:17], v[0:1], v[2:3] $vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, undef $vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115, 0, undef $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, undef $vgpr0_vgpr1, undef $vgpr2_vgpr3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index a42c8ac7..7581710 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -3182,7 +3182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1612 @@ -3443,7 +3443,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2636 @@ -3706,7 +3706,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3648 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 @@ -4135,7 +4135,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1596 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1600 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1608 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1612 @@ -4396,7 +4396,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2620 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2624 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2632 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2636 @@ -4661,7 +4661,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -6346,7 +6346,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608 @@ -6607,7 +6607,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s32 offset:2632 @@ -6872,7 +6872,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 ; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1 ; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 ; GFX1250-DAGISEL-NEXT: s_clause 0x3e @@ -7283,7 +7283,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608 @@ -7544,7 +7544,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s32 offset:2632 @@ -7807,7 +7807,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37] %ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent ret <2 x half> %ret @@ -9657,7 +9657,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x408 ; msbs: dst=0 src0=0 src1=2 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s33 offset:1620 @@ -9918,7 +9918,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c ; msbs: dst=0 src0=0 src1=3 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v768*/, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s33 offset:2644 @@ -10181,7 +10181,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s33 offset:3656 ; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42, s33 ; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164 @@ -10616,7 +10616,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s33 offset:1600 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s33 offset:1604 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s33 offset:1608 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x4080 ; msbs: dst=2 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s33 offset:1612 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s33 offset:1616 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s33 offset:1620 @@ -10877,7 +10877,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s33 offset:2624 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s33 offset:2628 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s33 offset:2632 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80c0 ; msbs: dst=3 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s33 offset:2636 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s33 offset:2640 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s33 offset:2644 @@ -11142,7 +11142,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s4 ; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s0 ; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc000 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent store float %ret, ptr %p diff --git a/llvm/test/CodeGen/Hexagon/isel-fclass.ll b/llvm/test/CodeGen/Hexagon/isel-fclass.ll new file mode 100644 index 0000000..96b0210 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/isel-fclass.ll @@ -0,0 +1,86 @@ +; Tests lowering of sfclass/dfclass compares. +; Sub-optimal code +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; r2 = p0 +; } +; { +; if (p0.new) r0 = ##1065353216 +; p0 = cmp.eq(r2,#0) +; jumpr r31 +; } +; With the patterns added, we should be generating +; { +; p0 = sfclass(r0,#16) +; r0 = sfadd(r0,r0) +; } +; { +; if (!p0) r0 = ##1065353216 +; jumpr r31 +; } + +; RUN: llc -march=hexagon -stop-after=hexagon-isel %s -o - | FileCheck %s + +; CHECK: bb.0.entry1 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_sfadd +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +define float @test1(float noundef %x) { +entry1: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float 1.000000e+00, float %add + ret float %spec.select +} + +; CHECK: bb.0.entry2 +; CHECK: F2_sfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_sfadd +define float @test2(float noundef %x) { +entry2: + %0 = tail call i32 @llvm.hexagon.F2.sfclass(float %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd float %x, %x + %spec.select = select i1 %tobool.not, float %add, float 1.000000e+00 + ret float %spec.select +} + +; CHECK: bb.0.entry3 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: C2_not +; CHECK: F2_dfadd +define double @test3(double noundef %x) { +entry3: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double 1.000000e+00, double %add + ret double %spec.select +} + +; CHECK: bb.0.entry4 +; CHECK: F2_dfclass +; CHECK-NOT: C2_cmp +; CHECK: F2_dfadd +define double @test4(double noundef %x) { +entry4: + %0 = tail call i32 @llvm.hexagon.F2.dfclass(double %x, i32 16) + %tobool.not = icmp eq i32 %0, 0 + %add = fadd double %x, %x + %spec.select = select i1 %tobool.not, double %add, double 1.000000e+00 + ret double %spec.select +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.dfclass(double, i32 immarg) + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare i32 @llvm.hexagon.F2.sfclass(float, i32 immarg) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll index ba2118f..b3155c9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvclz.b $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <32 x i8>, ptr %src + %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false) + store <32 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.h $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i16>, ptr %src + %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false) + store <16 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.w $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i32>, ptr %src + %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false) + store <8 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i64>, ptr %src + %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1> + %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false) + store <4 x i64> %res, ptr %dst + ret void +} + declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll index 79407c3..fa5f27e 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll @@ -7,38 +7,8 @@ define void @ceil_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrp.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrp.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrp.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrp.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -52,21 +22,7 @@ define void @ceil_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrp.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrp.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrp.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -81,38 +37,8 @@ define void @floor_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrm.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrm.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrm.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrm.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -126,21 +52,7 @@ define void @floor_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrm.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrm.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrm.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -155,38 +67,8 @@ define void @trunc_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrz.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrz.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrz.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrz.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -200,21 +82,7 @@ define void @trunc_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrz.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrz.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrz.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -229,38 +97,8 @@ define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 5 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr2, $xr0, 4 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrne.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 6 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 7 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 48 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: xvpickve.w $xr3, $xr0, 0 -; CHECK-NEXT: vreplvei.w $vr3, $vr3, 0 -; CHECK-NEXT: vfrintrne.s $vr3, $vr3 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 16 -; CHECK-NEXT: xvpickve.w $xr1, $xr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr3, $vr1, 32 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrne.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT: xvpermi.q $xr3, $xr2, 2 -; CHECK-NEXT: xvst $xr3, $a0, 0 +; CHECK-NEXT: xvfrintrne.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <8 x float>, ptr %a0 @@ -274,21 +112,7 @@ define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr2, $xr0, 2 -; CHECK-NEXT: vreplvei.d $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrne.d $vr2, $vr2 -; CHECK-NEXT: vextrins.d $vr2, $vr1, 16 -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.d $vr1, $vr1 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vfrintrne.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 +; CHECK-NEXT: xvfrintrne.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll index a9a38e8..6ac7d51 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vxori.b $vr0, $vr0, 255 +; CHECK-NEXT: vclz.b $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i8>, ptr %src + %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false) + store <16 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.h $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i16>, ptr %src + %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false) + store <8 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.w $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i32>, ptr %src + %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false) + store <4 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <2 x i64>, ptr %src + %neg = xor <2 x i64> %v, <i64 -1, i64 -1> + %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false) + store <2 x i64> %res, ptr %dst + ret void +} + declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll index 1ca6290..cb01ac0 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll @@ -7,22 +7,8 @@ define void @ceil_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrp.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrp.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -36,13 +22,7 @@ define void @ceil_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: ceil_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrp.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrp.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -57,22 +37,8 @@ define void @floor_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrm.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrm.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -86,13 +52,7 @@ define void @floor_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: floor_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrm.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrm.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -107,22 +67,8 @@ define void @trunc_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrz.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrz.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -136,13 +82,7 @@ define void @trunc_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: trunc_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrz.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrz.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -157,22 +97,8 @@ define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vreplvei.w $vr2, $vr0, 0 -; CHECK-NEXT: vreplvei.w $vr2, $vr2, 0 -; CHECK-NEXT: vfrintrne.s $vr2, $vr2 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 16 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2 -; CHECK-NEXT: vreplvei.w $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.s $vr1, $vr1 -; CHECK-NEXT: vextrins.w $vr2, $vr1, 32 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrne.s $vr0, $vr0 -; CHECK-NEXT: vextrins.w $vr2, $vr0, 48 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %v0 = load <4 x float>, ptr %a0 @@ -186,13 +112,7 @@ define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: roundeven_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT: vfrintrne.d $vr1, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ; CHECK-NEXT: vfrintrne.d $vr0, $vr0 -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll new file mode 100644 index 0000000..9a806a1 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll @@ -0,0 +1,758 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA32 %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA64 %s + +%struct.S = type { i64, i64, i8 } +%struct.F = type { float, double, float } +%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> } + +define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 8 +; LA32-NEXT: bnez $a1, .LBB0_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: move $s5, $zero +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB0_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: ld.w $a0, $s2, 4 +; LA32-NEXT: ld.w $a1, $s2, 0 +; LA32-NEXT: add.w $a0, $a0, $s6 +; LA32-NEXT: add.w $s3, $a1, $s3 +; LA32-NEXT: sltu $a1, $s3, $a1 +; LA32-NEXT: addi.w $s4, $s4, 1 +; LA32-NEXT: sltui $a2, $s4, 1 +; LA32-NEXT: add.w $s5, $s5, $a2 +; LA32-NEXT: xor $a2, $s4, $s1 +; LA32-NEXT: xor $a3, $s5, $s0 +; LA32-NEXT: or $a2, $a2, $a3 +; LA32-NEXT: add.w $s6, $a0, $a1 +; LA32-NEXT: bnez $a2, .LBB0_2 +; LA32-NEXT: b .LBB0_4 +; LA32-NEXT: .LBB0_3: +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s6, $zero +; LA32-NEXT: .LBB0_4: # %for.cond.cleanup +; LA32-NEXT: st.w $s3, $s2, 0 +; LA32-NEXT: st.w $s6, $s2, 4 +; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 8 +; LA64-NEXT: blez $s0, .LBB0_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB0_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $a0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: add.d $s2, $a0, $s2 +; LA64-NEXT: bnez $s0, .LBB0_2 +; LA64-NEXT: b .LBB0_4 +; LA64-NEXT: .LBB0_3: +; LA64-NEXT: move $s2, $zero +; LA64-NEXT: .LBB0_4: # %for.cond.cleanup +; LA64-NEXT: st.d $s2, $s1, 0 +; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load i64, ptr %y + %add = add nsw i64 %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] + store i64 %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 16 +; LA32-NEXT: bnez $a1, .LBB1_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB1_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: fld.s $fa0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: fadd.s $fs0, $fa0, $fs0 +; LA32-NEXT: bnez $a0, .LBB1_2 +; LA32-NEXT: b .LBB1_4 +; LA32-NEXT: .LBB1_3: +; LA32-NEXT: movgr2fr.w $fs0, $zero +; LA32-NEXT: .LBB1_4: # %for.cond.cleanup +; LA32-NEXT: fst.s $fs0, $s2, 0 +; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 16 +; LA64-NEXT: blez $s0, .LBB1_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB1_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: fld.s $fa0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: fadd.s $fs0, $fa0, $fs0 +; LA64-NEXT: bnez $s0, .LBB1_2 +; LA64-NEXT: b .LBB1_4 +; LA64-NEXT: .LBB1_3: +; LA64-NEXT: movgr2fr.w $fs0, $zero +; LA64-NEXT: .LBB1_4: # %for.cond.cleanup +; LA64-NEXT: fst.s $fs0, $s1, 0 +; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2 + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ] + call void @f(ptr %a) + %0 = load float, ptr %y + %add = fadd float %0, %s.05 + %inc = add nuw nsw i64 %i.06, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ] + store float %s.0.lcssa, ptr %y + ret void +} + +define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 16 +; LA32-NEXT: bnez $a1, .LBB2_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB2_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vld $vr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB2_2 +; LA32-NEXT: b .LBB2_4 +; LA32-NEXT: .LBB2_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB2_4: # %for.cond.cleanup +; LA32-NEXT: vst $vr0, $s2, 0 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 16 +; LA64-NEXT: blez $a1, .LBB2_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB2_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vld $vr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.w $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB2_2 +; LA64-NEXT: b .LBB2_4 +; LA64-NEXT: .LBB2_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB2_4: # %for.cond.cleanup +; LA64-NEXT: vst $vr0, $s1, 0 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <4 x i32>, ptr %y + %addv = add <4 x i32> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <4 x i32> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_v16i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a0, $a0, 6 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 32 +; LA32-NEXT: bnez $a1, .LBB3_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB3_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvld $xr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB3_2 +; LA32-NEXT: b .LBB3_4 +; LA32-NEXT: .LBB3_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB3_4: # %for.cond.cleanup +; LA32-NEXT: xvst $xr0, $s2, 0 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_v16i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: slli.d $a0, $a0, 6 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 32 +; LA64-NEXT: blez $a1, .LBB3_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB3_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvld $xr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB3_2 +; LA64-NEXT: b .LBB3_4 +; LA64-NEXT: .LBB3_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB3_4: # %for.cond.cleanup +; LA64-NEXT: xvst $xr0, $s1, 0 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %v = load <16 x i16>, ptr %y + %addv = add <16 x i16> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ] + store <16 x i16> %sum.lcssa, ptr %y + ret void +} + +define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extracti8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 16 +; LA32-NEXT: bnez $a1, .LBB4_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB4_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: vldrepl.b $vr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB4_2 +; LA32-NEXT: b .LBB4_4 +; LA32-NEXT: .LBB4_3: +; LA32-NEXT: vrepli.b $vr0, 0 +; LA32-NEXT: .LBB4_4: # %for.cond.cleanup +; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1 +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extracti8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 16 +; LA64-NEXT: blez $s0, .LBB4_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB4_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: vldrepl.b $vr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vadd.b $vr1, $vr0, $vr1 +; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB4_2 +; LA64-NEXT: b .LBB4_4 +; LA64-NEXT: .LBB4_3: +; LA64-NEXT: vrepli.b $vr0, 0 +; LA64-NEXT: .LBB4_4: # %for.cond.cleanup +; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1 +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load i8, ptr %y + %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0 + %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer + %addv = add <16 x i8> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <16 x i8> %sum.lcssa, i32 1 + store i8 %res, ptr %y + ret void +} + +define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind { +; LA32-LABEL: sink_fold_extractf64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: move $s0, $a3 +; LA32-NEXT: move $s1, $a2 +; LA32-NEXT: slli.w $a1, $a0, 4 +; LA32-NEXT: alsl.w $a0, $a0, $a1, 3 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: sltui $a1, $a3, 1 +; LA32-NEXT: slti $a2, $a3, 0 +; LA32-NEXT: masknez $a2, $a2, $a1 +; LA32-NEXT: sltui $a3, $s1, 1 +; LA32-NEXT: maskeqz $a1, $a3, $a1 +; LA32-NEXT: or $a1, $a1, $a2 +; LA32-NEXT: addi.w $s2, $a0, 8 +; LA32-NEXT: bnez $a1, .LBB5_3 +; LA32-NEXT: # %bb.1: # %for.body.preheader +; LA32-NEXT: move $fp, $a4 +; LA32-NEXT: move $s3, $zero +; LA32-NEXT: move $s4, $zero +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .p2align 4, , 16 +; LA32-NEXT: .LBB5_2: # %for.body +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl f +; LA32-NEXT: xvldrepl.d $xr0, $s2, 0 +; LA32-NEXT: addi.w $s3, $s3, 1 +; LA32-NEXT: sltui $a0, $s3, 1 +; LA32-NEXT: add.w $s4, $s4, $a0 +; LA32-NEXT: xor $a0, $s3, $s1 +; LA32-NEXT: xor $a1, $s4, $s0 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: bnez $a0, .LBB5_2 +; LA32-NEXT: b .LBB5_4 +; LA32-NEXT: .LBB5_3: +; LA32-NEXT: xvrepli.b $xr0, 0 +; LA32-NEXT: .LBB5_4: # %for.cond.cleanup +; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1 +; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret +; +; LA64-LABEL: sink_fold_extractf64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: move $s0, $a1 +; LA64-NEXT: slli.d $a1, $a0, 4 +; LA64-NEXT: alsl.d $a0, $a0, $a1, 3 +; LA64-NEXT: add.d $a0, $a2, $a0 +; LA64-NEXT: addi.d $s1, $a0, 8 +; LA64-NEXT: blez $s0, .LBB5_3 +; LA64-NEXT: # %bb.1: # %for.body.preheader +; LA64-NEXT: move $fp, $a2 +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .p2align 4, , 16 +; LA64-NEXT: .LBB5_2: # %for.body +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: xvldrepl.d $xr0, $s1, 0 +; LA64-NEXT: addi.d $s0, $s0, -1 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: bnez $s0, .LBB5_2 +; LA64-NEXT: b .LBB5_4 +; LA64-NEXT: .LBB5_3: +; LA64-NEXT: xvrepli.b $xr0, 0 +; LA64-NEXT: .LBB5_4: # %for.cond.cleanup +; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1 +; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +entry: + %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1 + %cmp = icmp sgt i64 %n, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + call void @f(ptr %a) + %e = load double, ptr %y + %ins0 = insertelement <4 x double> poison, double %e, i32 0 + %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer + %addv = fadd <4 x double> %v, %sum.0 + %inc = add nuw nsw i64 %i.0, 1 + %exitcond = icmp eq i64 %inc, %n + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ] + %res = extractelement <4 x double> %sum.lcssa, i32 1 + store double %res, ptr %y + ret void +} + +declare void @f(ptr) diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt index d3c0da9..000c67ef 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1717,8 +1705,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1750,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7014,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7149,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt index c6e5508..bb72886 100644 --- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt +++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt @@ -1439,11 +1439,8 @@ Key: PSUBWrm: [ 0.00 0.00 ] Key: PSUBWrr: [ 0.00 0.00 ] Key: PSWAPDrm: [ 0.00 0.00 ] Key: PSWAPDrr: [ 0.00 0.00 ] -Key: PT: [ 0.00 0.00 ] Key: PTCMMIMFP: [ 0.00 0.00 ] Key: PTCMMRLFP: [ 0.00 0.00 ] -Key: PTCONJTCMMIMFP: [ 0.00 0.00 ] -Key: PTCONJTFP: [ 0.00 0.00 ] Key: PTCVTROWD: [ 0.00 0.00 ] Key: PTCVTROWPS: [ 0.00 0.00 ] Key: PTDPBF: [ 0.00 0.00 ] @@ -1471,20 +1468,11 @@ Key: PTILEMOVROWrre: [ 0.00 0.00 ] Key: PTILEMOVROWrreV: [ 0.00 0.00 ] Key: PTILEMOVROWrri: [ 0.00 0.00 ] Key: PTILEMOVROWrriV: [ 0.00 0.00 ] -Key: PTILEPAIRLOAD: [ 0.00 0.00 ] -Key: PTILEPAIRSTORE: [ 0.00 0.00 ] Key: PTILESTORED: [ 0.00 0.00 ] Key: PTILESTOREDV: [ 0.00 0.00 ] Key: PTILEZERO: [ 0.00 0.00 ] Key: PTILEZEROV: [ 0.00 0.00 ] Key: PTMMULTF: [ 0.00 0.00 ] -Key: PTTCMMIMFP: [ 0.00 0.00 ] -Key: PTTCMMRLFP: [ 0.00 0.00 ] -Key: PTTDPBF: [ 0.00 0.00 ] -Key: PTTDPFP: [ 0.00 0.00 ] -Key: PTTMMULTF: [ 0.00 0.00 ] -Key: PTTRANSPOSED: [ 0.00 0.00 ] -Key: PTTRANSPOSEDV: [ 0.00 0.00 ] Key: PTWRITE: [ 0.00 0.00 ] Key: PTWRITEm: [ 0.00 0.00 ] Key: PTWRITEr: [ 0.00 0.00 ] @@ -1717,8 +1705,6 @@ Key: TAILJMPm: [ 0.00 0.00 ] Key: TAILJMPr: [ 0.00 0.00 ] Key: TCMMIMFP: [ 0.00 0.00 ] Key: TCMMRLFP: [ 0.00 0.00 ] -Key: TCONJTCMMIMFP: [ 0.00 0.00 ] -Key: TCONJTFP: [ 0.00 0.00 ] Key: TCRETURN_HIPE: [ 0.00 0.00 ] Key: TCRETURN_WIN: [ 0.00 0.00 ] Key: TCRETURN_WINmi: [ 0.00 0.00 ] @@ -1764,12 +1750,6 @@ Key: TPAUSE: [ 0.00 0.00 ] Key: TRAP: [ 0.00 0.00 ] Key: TST_F: [ 0.00 0.00 ] Key: TST_Fp: [ 0.00 0.00 ] -Key: TTCMMIMFP: [ 0.00 0.00 ] -Key: TTCMMRLFP: [ 0.00 0.00 ] -Key: TTDPBF: [ 0.00 0.00 ] -Key: TTDPFP: [ 0.00 0.00 ] -Key: TTMMULTF: [ 0.00 0.00 ] -Key: TTRANSPOSED: [ 0.00 0.00 ] Key: TZCNT: [ 0.00 0.00 ] Key: TZMSK: [ 0.00 0.00 ] Key: UBSAN_UD: [ 0.00 0.00 ] @@ -7034,7 +7014,6 @@ Key: PhyReg_VR256: [ 0.00 0.00 ] Key: PhyReg_VR512: [ 0.00 0.00 ] Key: PhyReg_VR512_0_15: [ 0.00 0.00 ] Key: PhyReg_TILE: [ 0.00 0.00 ] -Key: PhyReg_TILEPAIR: [ 0.00 0.00 ] Key: VirtReg_GR8: [ 0.00 0.00 ] Key: VirtReg_GRH8: [ 0.00 0.00 ] Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ] @@ -7170,4 +7149,3 @@ Key: VirtReg_VR256: [ 0.00 0.00 ] Key: VirtReg_VR512: [ 0.00 0.00 ] Key: VirtReg_VR512_0_15: [ 0.00 0.00 ] Key: VirtReg_TILE: [ 0.00 0.00 ] -Key: VirtReg_TILEPAIR: [ 0.00 0.00 ] diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll index bd8d882..9dd402d 100644 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll +++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll @@ -26,7 +26,7 @@ ; Also, the first eviction problem is significantly less than 300 instructions. Check ; that there is a zero value. ; Note: we're regex-ing some of the opcodes to avoid test flakyness. -; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0, +; CHECK: instructions: 20,{{([0-9]{4})}},{{([0-9]{4})}},{{([0-9]{4})}},{{.*}},0, ; Only the candidate virtreg and the 10th LR are included in this problem. Make ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s. ; There's a limit to how many repetitions can be matched. diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll index b5c43fd2..d653895 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll index 57342dc..5de1ac8 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll index 6296d5a..2f5c1ef 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll index e5ae387..a2b2c2f 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll index 7d04ada..e4c48dd 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index b0fe77c..727bb3b 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} ; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} target triple = "nvptx64-nvidia-cuda" @@ -29,10 +33,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -48,10 +52,10 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -79,10 +83,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -99,10 +103,10 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -131,10 +135,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -152,10 +156,10 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -185,10 +189,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -207,10 +211,10 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -241,10 +245,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -264,10 +268,10 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -297,10 +301,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -319,10 +323,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -354,10 +358,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -378,10 +382,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -415,10 +419,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -441,10 +445,10 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c3183a1..c50a0fb3 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -7,11 +7,11 @@ ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 0 ; Num Functions -; CHECK-NEXT: .word 12 +; CHECK-NEXT: .word 13 ; Num LargeConstants -; CHECK-NEXT: .word 2 +; CHECK-NEXT: .word 3 ; Num Callsites -; CHECK-NEXT: .word 16 +; CHECK-NEXT: .word 17 ; Functions and stack size ; CHECK-NEXT: .quad constantargs @@ -38,8 +38,8 @@ ; CHECK-NEXT: .quad liveConstant ; CHECK-NEXT: .quad 0 ; CHECK-NEXT: .quad 1 -; CHECK-NEXT: .quad spilledValue -; CHECK-NEXT: .quad 144 +; CHECK-NEXT: .quad liveArgs +; CHECK-NEXT: .quad 0 ; CHECK-NEXT: .quad 1 ; CHECK-NEXT: .quad directFrameIdx ; CHECK-NEXT: .quad 48 @@ -50,10 +50,14 @@ ; CHECK-NEXT: .quad needsStackRealignment ; CHECK-NEXT: .quad -1 ; CHECK-NEXT: .quad 1 +; CHECK-NEXT: .quad floats +; CHECK-NEXT: .quad 32 +; CHECK-NEXT: .quad 1 ; Num LargeConstants ; CHECK-NEXT: .quad 4294967295 ; CHECK-NEXT: .quad 4294967296 +; CHECK-NEXT: .quad 4609434218613702656 ; Constant arguments ; @@ -278,7 +282,7 @@ define void @liveConstant() { ; ; Verify 28 stack map entries. ; -; CHECK-LABEL: .word .L{{.*}}-spilledValue +; CHECK-LABEL: .word .L{{.*}}-liveArgs ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .half 28 ; @@ -290,7 +294,7 @@ define void @liveConstant() { ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) { +define void @liveArgs(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 zeroext %l26, i32 signext %l27) { entry: call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i8 %l25, i16 %l26, i32 %l27) ret void @@ -379,6 +383,104 @@ define void @needsStackRealignment() { } declare void @escape_values(...) +; CHECK-LABEL: .word .L{{.*}}-floats +; CHECK-NEXT: .half 0 +; Num Locations +; CHECK-NEXT: .half 12 +; Loc 0: constant float as constant integer +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 1: constant double as large constant integer +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 2: constant half as constant integer +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 3: constant bfloat as constant integer +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 4: float value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 10 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 5: double value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 11 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 6: half value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 12 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 7: bfloat value in X register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 13 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 8: float on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 9: double on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 10: half on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +; Loc 11: bfloat on stack +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +define void @floats(float %f, double %g, half %h, bfloat %i) { + %ff = alloca float + %gg = alloca double + %hh = alloca half + %ii = alloca bfloat + call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25, + double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii) + ret void +} + declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 4c35b25..7e6f2c7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -15265,6 +15265,259 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ret <4 x i32> %x } +define <7 x i8> @mgather_baseidx_v7i8(ptr %base, <7 x i8> %idxs, <7 x i1> %m, <7 x i8> %passthru) { +; RV32-LABEL: mgather_baseidx_v7i8: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 127 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vmand.mm v0, v0, v10 +; RV32-NEXT: vsext.vf4 v10, v8 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64V-LABEL: mgather_baseidx_v7i8: +; RV64V: # %bb.0: +; RV64V-NEXT: li a1, 127 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vmv.s.x v10, a1 +; RV64V-NEXT: vmand.mm v0, v0, v10 +; RV64V-NEXT: vsext.vf8 v12, v8 +; RV64V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64V-NEXT: vluxei64.v v9, (a0), v12, v0.t +; RV64V-NEXT: vmv1r.v v8, v9 +; RV64V-NEXT: ret +; +; RV64ZVE32F-LABEL: mgather_baseidx_v7i8: +; RV64ZVE32F: # %bb.0: +; RV64ZVE32F-NEXT: addi sp, sp, -16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVE32F-NEXT: .cfi_remember_state +; RV64ZVE32F-NEXT: li a1, 64 +; RV64ZVE32F-NEXT: addi a2, sp, 8 +; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64ZVE32F-NEXT: vsm.v v0, (a2) +; RV64ZVE32F-NEXT: ld a1, 8(sp) +; RV64ZVE32F-NEXT: andi a2, a1, 1 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_2 +; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_2: # %else +; RV64ZVE32F-NEXT: andi a2, a1, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_4 +; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vmv.v.x v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_4: # %else2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 3 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 4 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v11, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_6: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 5 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB132_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_13 +; RV64ZVE32F-NEXT: # %bb.9: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: bnez a2, .LBB132_14 +; RV64ZVE32F-NEXT: .LBB132_10: # %else14 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: beqz a1, .LBB132_12 +; RV64ZVE32F-NEXT: .LBB132_11: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: add a0, a0, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vmv.x.s a1, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: .LBB132_12: # %else17 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmv1r.v v8, v9 +; RV64ZVE32F-NEXT: addi sp, sp, 16 +; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB132_13: # %cond.load10 +; RV64ZVE32F-NEXT: .cfi_restore_state +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.v.x v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB132_10 +; RV64ZVE32F-NEXT: .LBB132_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: vmv.x.s a4, v11 +; RV64ZVE32F-NEXT: vmv.v.x v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 3 +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 6 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lbu a3, 0(a3) +; RV64ZVE32F-NEXT: vslide1down.vx v11, v11, a2 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v11, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v10, a3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v10, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: andi a1, a1, 64 +; RV64ZVE32F-NEXT: bnez a1, .LBB132_11 +; RV64ZVE32F-NEXT: j .LBB132_12 + %ptrs = getelementptr inbounds i8, ptr %base, <7 x i8> %idxs + %v = call <7 x i8> @llvm.masked.gather.v7i8.v7p0(<7 x ptr> %ptrs, i32 1, <7 x i1> %m, <7 x i8> %passthru) + ret <7 x i8> %v +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32V-ZVFH: {{.*}} ; RV32V-ZVFHMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/pr165232.ll b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll new file mode 100644 index 0000000..bef53c6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/pr165232.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i1 @main(ptr %var_117, ptr %arrayinit.element3045, ptr %arrayinit.element3047, ptr %arrayinit.element3049, ptr %arrayinit.element3051, ptr %arrayinit.element3053, ptr %arrayinit.element3055, ptr %arrayinit.element3057, ptr %arrayinit.element3059, ptr %arrayinit.element3061, ptr %arrayinit.element3063, ptr %arrayinit.element3065, ptr %arrayinit.element3067, i64 %var_94_i.07698, target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr t0, vlenb +; CHECK-NEXT: slli t0, t0, 3 +; CHECK-NEXT: mv t1, t0 +; CHECK-NEXT: slli t0, t0, 1 +; CHECK-NEXT: add t0, t0, t1 +; CHECK-NEXT: sub sp, sp, t0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: sd a1, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd a2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t0, 56(a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t1, 48(a1) +; CHECK-NEXT: vsetvli t2, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t2, 40(a1) +; CHECK-NEXT: # kill: def $v10 killed $v9 killed $vtype +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t3, 32(a1) +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t4, 16(a1) +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: ld t5, 24(a1) +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vsetvli t6, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v22, 0 +; CHECK-NEXT: vmv1r.v v14, v9 +; CHECK-NEXT: sd zero, 0(a0) +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v15, v9 +; CHECK-NEXT: vmv1r.v v18, v9 +; CHECK-NEXT: li t6, 1023 +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: vmv1r.v v19, v9 +; CHECK-NEXT: slli t6, t6, 52 +; CHECK-NEXT: vmv.v.i v28, 0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs2r.v v22, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vs4r.v v24, (a1) # vscale x 32-byte Folded Spill +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: ld a2, 0(sp) # 8-byte Folded Reload +; CHECK-NEXT: vs2r.v v28, (a1) # vscale x 16-byte Folded Spill +; CHECK-NEXT: ld a1, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: sd t6, 0(t5) +; CHECK-NEXT: vmv2r.v v16, v14 +; CHECK-NEXT: vmv2r.v v14, v12 +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv1r.v v21, v9 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vs2r.v v20, (t5) # vscale x 16-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v19, 0 +; CHECK-NEXT: vmclr.m v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.i v6, 0 +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v20, v19 +; CHECK-NEXT: vmv1r.v v3, v19 +; CHECK-NEXT: vmv1r.v v5, v19 +; CHECK-NEXT: vmv1r.v v2, v19 +; CHECK-NEXT: vmv1r.v v31, v19 +; CHECK-NEXT: vmv1r.v v30, v19 +; CHECK-NEXT: vmv1r.v v4, v19 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv4r.v v24, v12 +; CHECK-NEXT: vmv2r.v v28, v16 +; CHECK-NEXT: vmv2r.v v8, v6 +; CHECK-NEXT: vmv1r.v v18, v19 +; CHECK-NEXT: vmv1r.v v21, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vle32.v v20, (t4) +; CHECK-NEXT: vle32.v v3, (t1) +; CHECK-NEXT: vle32.v v30, (a7) +; CHECK-NEXT: vle64.v v8, (a4) +; CHECK-NEXT: vle32.v v5, (t2) +; CHECK-NEXT: vle32.v v2, (t3) +; CHECK-NEXT: vle32.v v31, (a6) +; CHECK-NEXT: vmv1r.v v24, v30 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmflt.vv v21, v8, v6, v0.t +; CHECK-NEXT: vmv1r.v v8, v19 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; CHECK-NEXT: vle32.v v18, (a2) +; CHECK-NEXT: vle32.v v8, (a3) +; CHECK-NEXT: vle32.v v4, (a5) +; CHECK-NEXT: vmv1r.v v22, v20 +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 3 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl1r.v v1, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl2r.v v2, (t5) # vscale x 16-byte Folded Reload +; CHECK-NEXT: slli t6, t6, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl1r.v v4, (t5) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vsseg4e32.v v1, (zero) +; CHECK-NEXT: vsseg8e32.v v22, (a1) +; CHECK-NEXT: vmv1r.v v0, v21 +; CHECK-NEXT: vssub.vv v8, v19, v18, v0.t +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 2 +; CHECK-NEXT: mv t6, t5 +; CHECK-NEXT: slli t5, t5, 1 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, t0, e64, m2, ta, ma +; CHECK-NEXT: vsseg2e64.v v20, (zero) +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: addi t5, sp, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetivli zero, 0, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: csrr t5, vlenb +; CHECK-NEXT: slli t5, t5, 4 +; CHECK-NEXT: add t5, sp, t5 +; CHECK-NEXT: addi t5, t5, 16 +; CHECK-NEXT: vl4r.v v20, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: csrr t6, vlenb +; CHECK-NEXT: slli t6, t6, 2 +; CHECK-NEXT: add t5, t5, t6 +; CHECK-NEXT: vl4r.v v24, (t5) # vscale x 32-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v20, (zero) +; CHECK-NEXT: j .LBB0_1 +entry: + store double 0.000000e+00, ptr %var_117, align 8 + store double 1.000000e+00, ptr %arrayinit.element3061, align 8 + br label %for.body + +for.body: ; preds = %for.body, %entry + %2 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3059, i64 0) + %3 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3067, i64 0) + %4 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3065, i64 0) + %5 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3063, i64 0) + %6 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3055, i64 0) + %7 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3057, i64 0) + %8 = call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.p0.i64(<vscale x 2 x float> zeroinitializer, ptr %arrayinit.element3053, i64 0) + %9 = call <vscale x 2 x double> @llvm.riscv.vle.nxv2f64.p0.i64(<vscale x 2 x double> zeroinitializer, ptr %arrayinit.element3051, i64 0) + %10 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3047, i64 0) + %11 = tail call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.p0.i64(<vscale x 2 x i32> zeroinitializer, ptr %arrayinit.element3049, i64 0) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) zeroinitializer, ptr null, i64 0, i64 5) + %12 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) zeroinitializer, <vscale x 2 x float> %8, i32 0) + %13 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %12, <vscale x 2 x float> %7, i32 2) + %14 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %13, <vscale x 2 x float> %6, i32 0) + %15 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %14, <vscale x 2 x float> %5, i32 0) + %16 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %15, <vscale x 2 x float> %4, i32 0) + %17 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %16, <vscale x 2 x float> %3, i32 0) + %18 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2f32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %17, <vscale x 2 x float> %2, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %18, ptr %arrayinit.element3045, i64 0, i64 5) + %19 = tail call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2f64.nxv2f64.i64(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %9, <vscale x 2 x i1> zeroinitializer, i64 0) + %20 = tail call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> %11, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %10, <vscale x 2 x i1> %19, i64 0, i64 0) + call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv16i8_2t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, ptr null, i64 %var_94_i.07698, i64 6) + call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv2i1.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) zeroinitializer, ptr null, <vscale x 2 x i1> zeroinitializer, i64 0, i64 6) + %21 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) poison, <vscale x 2 x i32> %20, i32 0) + call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %21, ptr %var_117, i64 0, i64 5) + call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %1, ptr null, i64 0, i64 6) + br label %for.body +} diff --git a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir index dd9960d..9c2fa9d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir +++ b/llvm/test/CodeGen/RISCV/rvv/zvlsseg-spill.mir @@ -32,10 +32,10 @@ body: | ; CHECK-NEXT: $x11 = ADDI $x2, 16 ; CHECK-NEXT: VS4R_V $v0m4, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s256>) into %stack.0, align 8) ; CHECK-NEXT: $x12 = PseudoReadVLENB - ; CHECK-NEXT: $x13 = SLLI $x12, 2 - ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13 + ; CHECK-NEXT: $x12 = SLLI killed $x12, 2 + ; CHECK-NEXT: $x11 = ADD killed $x11, $x12 ; CHECK-NEXT: VS2R_V $v4m2, $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s128>) into %stack.0, align 8) - ; CHECK-NEXT: $x12 = SLLI killed $x12, 1 + ; CHECK-NEXT: $x12 = SRLI killed $x12, 1 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12 ; CHECK-NEXT: VS1R_V $v6, killed $x11, implicit $v0_v1_v2_v3_v4_v5_v6 :: (store (<vscale x 1 x s64>) into %stack.0) ; CHECK-NEXT: $x11 = ADDI $x2, 16 @@ -93,10 +93,10 @@ body: | ; CHECK-NEXT: $x11 = ADDI $x2, 16 ; CHECK-NEXT: $v10m2 = VL2RE8_V $x11 :: (load (<vscale x 1 x s128>) from %stack.0, align 8) ; CHECK-NEXT: $x12 = PseudoReadVLENB - ; CHECK-NEXT: $x13 = SLLI $x12, 1 - ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x13 + ; CHECK-NEXT: $x12 = SLLI killed $x12, 1 + ; CHECK-NEXT: $x11 = ADD killed $x11, $x12 ; CHECK-NEXT: $v12m4 = VL4RE8_V $x11 :: (load (<vscale x 1 x s256>) from %stack.0, align 8) - ; CHECK-NEXT: $x12 = SLLI killed $x12, 2 + ; CHECK-NEXT: $x12 = SLLI killed $x12, 1 ; CHECK-NEXT: $x11 = ADD killed $x11, killed $x12 ; CHECK-NEXT: $v16 = VL1RE8_V killed $x11 :: (load (<vscale x 1 x s64>) from %stack.0) ; CHECK-NEXT: VS1R_V killed $v10, killed renamable $x10 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll index ed67344..4817e74 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/issue-146942-ptr-cast.ll @@ -16,7 +16,6 @@ define void @case1() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <4 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.2) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) @@ -29,8 +28,7 @@ define void @case1() local_unnamed_addr { define void @case2() local_unnamed_addr { ; CHECK: %[[#BUFFER_LOAD:]] = OpLoad %[[#FLOAT4]] %{{[0-9]+}} Aligned 16 ; CHECK: %[[#CAST_LOAD:]] = OpBitcast %[[#INT4]] %[[#BUFFER_LOAD]] - ; CHECK: %[[#VEC_SHUFFLE:]] = OpVectorShuffle %[[#INT4]] %[[#CAST_LOAD]] %[[#CAST_LOAD]] 0 1 2 3 - ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#VEC_SHUFFLE]] %[[#UNDEF_INT4]] 0 1 2 + ; CHECK: %[[#VEC_TRUNCATE:]] = OpVectorShuffle %[[#INT3]] %[[#CAST_LOAD]] %[[#UNDEF_INT4]] 0 1 2 %1 = tail call target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v4f32_12_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str) %2 = tail call target("spirv.VulkanBuffer", [0 x <3 x i32>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v3i32_12_1t(i32 0, i32 5, i32 1, i32 0, ptr nonnull @.str.3) %3 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v4f32_12_0t(target("spirv.VulkanBuffer", [0 x <4 x float>], 12, 0) %1, i32 0) diff --git a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll index 8491328..a1ec2cd 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/ptrcast-bitcast.ll @@ -26,3 +26,25 @@ entry: store <4 x i32> %6, ptr addrspace(11) %7, align 16 ret void } + +; This tests a load from a pointer that has been bitcast between vector types +; which share the same total bit-width but have different numbers of elements. +; Tests that legalize-pointer-casts works correctly by moving the bitcast to +; the element that was loaded. + +define void @main2() local_unnamed_addr #0 { +entry: +; CHECK: %[[LOAD:[0-9]+]] = OpLoad %[[#v2_double]] {{.*}} +; CHECK: %[[BITCAST1:[0-9]+]] = OpBitcast %[[#v4_uint]] %[[LOAD]] +; CHECK: %[[BITCAST2:[0-9]+]] = OpBitcast %[[#v2_double]] %[[BITCAST1]] +; CHECK: OpStore {{%[0-9]+}} %[[BITCAST2]] {{.*}} + + %0 = tail call target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0v2f64_12_1t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.2) + %2 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 0) + %3 = load <4 x i32>, ptr addrspace(11) %2 + %4 = tail call noundef align 16 dereferenceable(16) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0v2f64_12_1t(target("spirv.VulkanBuffer", [0 x <2 x double>], 12, 1) %0, i32 1) + store <4 x i32> %3, ptr addrspace(11) %4 + ret void +} + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll index 05b8de7..f414ea3 100644 --- a/llvm/test/CodeGen/SystemZ/stackmap.ll +++ b/llvm/test/CodeGen/SystemZ/stackmap.ll @@ -84,14 +84,14 @@ ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 8 ; CHECK-NEXT: .short 0 ; CHECK-NEXT: .short 0 -; CHECK-NEXT: .long 65535 +; CHECK-NEXT: .long -1 ; SmallConstant ; CHECK-NEXT: .byte 4 ; CHECK-NEXT: .byte 0 diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll index 6d0f3c5..caf7a1c 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \ -; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tilezero %tmm1 ; CHECK-NEXT: tilezero %tmm2 ; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2 ; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1) ret void } @@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll index af1a7ae..642c1b7 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_tmmultf32ps() { ; CHECK-LABEL: test_tmmultf32ps: @@ -11,13 +11,3 @@ define void @test_tmmultf32ps() { } declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C) -define void @test_ttmmultf32ps() { -; CHECK-LABEL: test_ttmmultf32ps: -; CHECK: # %bb.0: -; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: retq - call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) - ret void -} -declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C) - diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll deleted file mode 100755 index 1f5758c..0000000 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ /dev/null @@ -1,122 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i64 %stride, i8* %addr1) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] -; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) - ret void -} -declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) - -define void @test_amx2(i8* %base, i64 %stride) #0 { -; O0-LABEL: test_amx2: -; O0: # %bb.0: -; O0-NEXT: xorps %xmm0, %xmm0 -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: tilerelease -; O0-NEXT: retq -; -; O2-LABEL: test_amx2: -; O2: # %bb.0: -; O2-NEXT: xorps %xmm0, %xmm0 -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, %ax -; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: tilerelease -; O2-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: retq # encoding: [0xc3] - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - ret void -} -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll deleted file mode 100644 index 4f41410..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll +++ /dev/null @@ -1,136 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s - -@buf = dso_local global [2048 x i8] zeroinitializer, align 16 -@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - -define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: test_tile_2rpntlvwz0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %si, %cx -; CHECK-NEXT: movw %di, %ax -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %esi -; CHECK-NEXT: movl $32, %edi -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: movl $buf2, %edx -; CHECK-NEXT: movl $32, %esi -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) -; CHECK-NEXT: leaq -8(%rbp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 - ret void -} - -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - -declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - -declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - -attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } -attributes #1 = { argmemonly nofree nounwind readonly } -attributes #2 = { nofree nosync nounwind readnone } -attributes #3 = { nounwind } -attributes #4 = { argmemonly nounwind writeonly } - -!llvm.module.flags = !{!0, !1, !2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"uwtable", i32 2} -!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir deleted file mode 100644 index ab12ab3..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir +++ /dev/null @@ -1,165 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - ; CHECK-NEXT: renamable $cx = MOV16ri 64 - ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: renamable $r8w = MOV16ri 16 - ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: renamable $r9 = COPY $rsi - ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - ; CHECK-NEXT: renamable $r8 = COPY $rdi - ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - ; CHECK-NEXT: renamable $r10 = COPY $rax - ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 - ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $zmm0 = AVX512_512_SET0 - VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - renamable $rcx = MOV32ri64 64 - MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - renamable $cx = MOV16ri 64 - MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - renamable $cx = MOV16ri 16 - renamable $r8w = MOV16ri 16 - MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - renamable $r9 = COPY $rsi - $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - renamable $r8 = COPY $rdi - $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - renamable $r10 = COPY $rax - $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir deleted file mode 100644 index c7d241f..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir +++ /dev/null @@ -1,153 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s - ---- | - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = extractvalue { x86_amx, x86_amx } %0, 1 - %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 - ret void - } - - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 - - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } - -... ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } - - { id: 14, class: vr512, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf - ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 - ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %14:vr512 = AVX512_512_SET0 - VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) - MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - %6:gr64 = MOV32ri64 @buf - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg - %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit - %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 - %13:gr64 = MOV32ri64 @buf2 - PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 - RET 0 - -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir deleted file mode 100644 index 66b15aa..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir +++ /dev/null @@ -1,97 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } - - { reg: '$cx', virtual-reg: '' } - - { reg: '$r9', virtual-reg: '' } - - { reg: '$r10', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) - ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) - ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) - ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) - ; CHECK-NEXT: renamable $di = MOV16ri 64 - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - renamable $r8 = MOV32ri64 64 - MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) - renamable $di = MOV16ri 64 - renamable $cx = MOV16ri 16 - PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll deleted file mode 100644 index 3549875..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: noinline nounwind optnone uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) -; CHECK-NEXT: ret void -; - entry: - - %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 - store <256 x i32> %2, ptr %m, align 1024 - - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 - store <256 x i32> %4, ptr %m, align 1024 - - %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 - %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 - store <256 x i32> %6, ptr %m, align 64 - - %7 = load <256 x i32>, ptr %m, align 64 - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 - %9 = load <256 x i32>, ptr %m, align 64 - %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 - %11 = load <256 x i32>, ptr %m, align 64 - %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 - - %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 - %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 - store <256 x i32> %14, ptr %m, align 64 - - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 - - attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #7 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll deleted file mode 100644 index 96966264..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll +++ /dev/null @@ -1,61 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: nounwind uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir deleted file mode 100644 index 1e3b242..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir +++ /dev/null @@ -1,134 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr64_nosp, preferred-register: '' } - - { id: 1, class: gr16, preferred-register: '' } - - { id: 2, class: gr16, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr64, preferred-register: '' } - - { id: 5, class: gr64, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 181, class: tile, preferred-register: '' } - - { id: 183, class: tile, preferred-register: '' } - - { id: 185, class: tile, preferred-register: '' } - - { id: 186, class: tile, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 21, name: '', type: default, offset: 0, size: 8, - alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 - ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 - ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] - ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - %0:gr64_nosp = MOV32ri64 64 - %1:gr16 = MOV16ri 64 - %2:gr16 = MOV16ri 16 - %3:gr16 = MOV16ri 16 - %4:gr64 = COPY $rsi - %5:gr64 = COPY $rdi - %6:gr64 = COPY $rdx - %7:gr64_nosp = COPY $rax - %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 - PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 - %11:tile = PTILEZEROV %1, %2 - PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11 - %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg - %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg - %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg - %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 - PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir deleted file mode 100644 index ac2cdb4..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir +++ /dev/null @@ -1,113 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $rax, $rbx - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx - ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %3:gr16 = COPY %2.sub_16bit - %4:gr16 = COPY %1.sub_16bit - %5:gr16 = COPY %0.sub_16bit - %6:gr64 = COPY $rax - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - %11:tile = PTILEZEROV %5, %4 - %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 - %13:gr64 = COPY $rbx - PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 - RET 0 - -... diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll deleted file mode 100644 index 4cfd97a..0000000 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ /dev/null @@ -1,371 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: ttransposed %tmm3, %tmm1 -; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 -; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] -; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] -; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] -; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] -; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] -; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] -; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] -; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.ttransposed(i8 1, i8 3) - call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6) - call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtfp16(i8 1, i8 2) - ret void -} - -declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) -declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B) - -define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: tilezero %tmm1 -; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: movabsq $64, %rbp -; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 -; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) -; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: pushq %rbp # encoding: [0x55] -; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] -; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] -; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] -; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] -; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0] -; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] -; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] -; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] -; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] -; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] -; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: popq %rbp # encoding: [0x5d] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b) - %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b) - %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b) - %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5) - - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4) - ret void -} - -define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx3: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movw $8, %cx -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: ttransposed %tmm4, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx3: -; EGPR: # %bb.0: -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] -; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] -; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] -; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %5 = extractvalue { x86_amx, x86_amx } %4, 0 - %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) - ret void -} - -define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx_spill: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) -; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx_spill: -; EGPR: # %bb.0: -; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] -; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] -; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] -; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] -; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] -; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 - %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 - %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 - %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 - %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 - %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 - %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 - %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 - %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 - %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) - ret void -} - -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) -declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) -declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx) - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 06e7d47..8007d9d 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,14 +203,24 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $32, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shll %cl, %edx +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB5_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %edx, %esi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: .LBB5_2: +; X86-NEXT: andl 4(%eax), %esi +; X86-NEXT: andl (%eax), %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -232,20 +242,38 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB6_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB6_2: +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl %esi, %ebx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: andl %eax, %ebp +; X86-NEXT: xorl %esi, %edi +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: setne %al +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %edi, 4(%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -272,20 +300,40 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB7_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: .LBB7_2: +; X86-NEXT: movl (%edx), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: notl %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: andl %esi, %ebp +; X86-NEXT: notl %esi +; X86-NEXT: andl %ecx, %edi +; X86-NEXT: andl %eax, %esi +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: sete %al +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl %edi, 4(%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -313,20 +361,38 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: je .LBB8_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB8_2: +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl %esi, %ebx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: andl %eax, %ebp +; X86-NEXT: orl %esi, %edi +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: setne %al +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %edi, 4(%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -353,47 +419,52 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shldl %cl, %eax, %edx ; X86-NEXT: shll %cl, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shll %cl, %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB9_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl $0, %eax ; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: notl %ebp ; X86-NEXT: je .LBB9_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setae %al -; X86-NEXT: movl %esi, 4(%ebx) -; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl (%edi), %ecx +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl %ecx, %ebp +; X86-NEXT: orl %esi, %ebp +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %ebp, (%edi) +; X86-NEXT: movl %ebx, 4(%edi) +; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -445,25 +516,101 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $96, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, (%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %esi +; X86-NEXT: movl 24(%esp,%esi), %edi +; X86-NEXT: movl 28(%esp,%esi), %eax +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl 16(%esp,%esi), %edx +; X86-NEXT: movl 20(%esp,%esi), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl 8(%ebx), %edi +; X86-NEXT: andl (%ebx), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: andl 12(%ebx), %eax +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: test_ne_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl $96, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: btl %esi, %eax -; X64-NEXT: setb %al -; X64-NEXT: retq +; SSE-LABEL: test_ne_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: xorl %edx, %edx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: cmovneq %rsi, %rax +; SSE-NEXT: andq 8(%rdi), %rdx +; SSE-NEXT: andq (%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ne_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: movl $1, %edx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %rdx, %rsi +; AVX2-NEXT: shlxq %rcx, %rdx, %rdx +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rdx, %rsi +; AVX2-NEXT: cmovneq %rax, %rdx +; AVX2-NEXT: andq 8(%rdi), %rsi +; AVX2-NEXT: andq (%rdi), %rdx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: setne %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ne_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %eax +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rax, %rdx +; AVX512-NEXT: cmovneq %rsi, %rax +; AVX512-NEXT: andq 8(%rdi), %rdx +; AVX512-NEXT: andq (%rdi), %rax +; AVX512-NEXT: orq %rdx, %rax +; AVX512-NEXT: setne %al +; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -476,33 +623,124 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %esi +; X86-NEXT: movl 60(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 8(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 12(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: complement_ne_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $96, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btcl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: complement_ne_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r8 +; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: andq %rdx, %r9 +; SSE-NEXT: xorq %rcx, %rsi +; SSE-NEXT: xorq %rax, %rdx +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: setne %al +; SSE-NEXT: movq %rdx, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: complement_ne_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movl $1, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: shldq %cl, %rdx, %rsi +; AVX-NEXT: shlxq %rcx, %rdx, %rdx +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %rdx, %rsi +; AVX-NEXT: cmovneq %rax, %rdx +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: movq %rcx, %r8 +; AVX-NEXT: andq %rsi, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: andq %rdx, %r9 +; AVX-NEXT: xorq %rcx, %rsi +; AVX-NEXT: xorq %rax, %rdx +; AVX-NEXT: orq %r8, %r9 +; AVX-NEXT: setne %al +; AVX-NEXT: movq %rdx, (%rdi) +; AVX-NEXT: movq %rsi, 8(%rdi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -517,33 +755,124 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %esi +; X86-NEXT: movl 52(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl 8(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl (%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl 4(%ebx), %ebx +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl %ebx, %ecx +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %edi +; X86-NEXT: movl %edx, 8(%edi) +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %esi, (%edi) +; X86-NEXT: movl %ecx, 4(%edi) +; X86-NEXT: sete %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: reset_eq_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $96, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setae %al -; X64-NEXT: btrl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: reset_eq_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r8 +; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: notq %rsi +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: andq %rdx, %r9 +; SSE-NEXT: notq %rdx +; SSE-NEXT: andq %rcx, %rsi +; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: sete %al +; SSE-NEXT: movq %rdx, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: reset_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movl $1, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: shldq %cl, %rdx, %rsi +; AVX-NEXT: shlxq %rcx, %rdx, %rdx +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %rdx, %rsi +; AVX-NEXT: cmovneq %rax, %rdx +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: andnq %rcx, %rsi, %r8 +; AVX-NEXT: andq %rsi, %rcx +; AVX-NEXT: andnq %rax, %rdx, %rsi +; AVX-NEXT: andq %rdx, %rax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: sete %al +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq %r8, 8(%rdi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -559,33 +888,124 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %esi +; X86-NEXT: movl 60(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 8(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 12(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: set_ne_i128: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: andl $96, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btsl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: set_ne_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rax, %rdx +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r8 +; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: andq %rdx, %r9 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: setne %al +; SSE-NEXT: movq %rdx, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: set_ne_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movl $1, %edx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: shldq %cl, %rdx, %rsi +; AVX-NEXT: shlxq %rcx, %rdx, %rdx +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %rdx, %rsi +; AVX-NEXT: cmovneq %rax, %rdx +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: movq %rcx, %r8 +; AVX-NEXT: andq %rsi, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: andq %rdx, %r9 +; AVX-NEXT: orq %rcx, %rsi +; AVX-NEXT: orq %rax, %rdx +; AVX-NEXT: orq %r8, %r9 +; AVX-NEXT: setne %al +; AVX-NEXT: movq %rdx, (%rdi) +; AVX-NEXT: movq %rsi, 8(%rdi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -606,9 +1026,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx +; X86-NEXT: subl $128, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: movzbl 16(%ebp), %eax ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -617,29 +1037,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 72(%esp,%edi), %edx -; X86-NEXT: movl 76(%esp,%edi), %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%edi), %ebx -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrb $3, %dl +; X86-NEXT: andb $12, %dl +; X86-NEXT: negb %dl +; X86-NEXT: movsbl %dl, %esi +; X86-NEXT: movl 64(%esp,%esi), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: notl %esi +; X86-NEXT: movl 68(%esp,%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 72(%esp,%esi), %ebx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 76(%esp,%esi), %edi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -647,53 +1063,72 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 40(%esp,%eax), %edi -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %edi, %esi ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 12(%ecx), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ecx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 100(%esp,%ecx), %edi +; X86-NEXT: movl 104(%esp,%ecx), %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 108(%esp,%ebx), %ebx +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: notl %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl 36(%esp,%esi), %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 8(%edx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: andl 4(%edi), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 96(%esp,%ebx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edi), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%edi,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: movl %edx, (%edi) -; X86-NEXT: setae %al +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl %edx, 4(%ecx) +; X86-NEXT: sete %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -716,84 +1151,86 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: testb $64, %cl ; SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 ; SSE-NEXT: cmovneq %rax, %rdx ; SSE-NEXT: cmovneq %r9, %rax +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r9 +; SSE-NEXT: movq %r9, %r10 +; SSE-NEXT: andq %r8, %r10 +; SSE-NEXT: notq %r8 +; SSE-NEXT: movq %rcx, %r11 +; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 +; SSE-NEXT: andq %r9, %r8 ; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi +; SSE-NEXT: andq %rcx, %rsi ; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: setae %al -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: orq %r10, %r11 +; SSE-NEXT: sete %al ; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) ; SSE-NEXT: retq ; ; AVX2-LABEL: init_eq_i128: ; AVX2: # %bb.0: ; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: movl $1, %esi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: shldq %cl, %rsi, %rax ; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: movl %edx, %edx ; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: shldq %cl, %rdx, %r9 +; AVX2-NEXT: shlxq %rcx, %rsi, %rsi ; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: cmovneq %rsi, %rax +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: shlxq %rcx, %rdx, %rcx +; AVX2-NEXT: cmovneq %rcx, %r9 +; AVX2-NEXT: cmovneq %r8, %rcx +; AVX2-NEXT: movq (%rdi), %rdx +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: andnq %r8, %rax, %r10 +; AVX2-NEXT: andq %rax, %r8 +; AVX2-NEXT: andnq %rdx, %rsi, %r11 +; AVX2-NEXT: andq %rsi, %rdx +; AVX2-NEXT: orq %r9, %r10 +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: sete %al +; AVX2-NEXT: movq %r11, (%rdi) +; AVX2-NEXT: movq %r10, 8(%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: init_eq_i128: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movl $1, %esi ; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: shldq %cl, %rsi, %r8 +; AVX512-NEXT: shlxq %rcx, %rsi, %rsi ; AVX512-NEXT: movl %edx, %edx ; AVX512-NEXT: xorl %r9d, %r9d ; AVX512-NEXT: shldq %cl, %rdx, %r9 ; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rsi, %r8 ; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: shlxq %rcx, %rdx, %rcx +; AVX512-NEXT: cmovneq %rcx, %r9 +; AVX512-NEXT: cmovneq %rax, %rcx +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: movq 8(%rdi), %rdx +; AVX512-NEXT: andnq %rdx, %r8, %r10 +; AVX512-NEXT: andq %r8, %rdx +; AVX512-NEXT: andnq %rax, %rsi, %r8 +; AVX512-NEXT: andq %rsi, %rax +; AVX512-NEXT: orq %r9, %r10 +; AVX512-NEXT: orq %rcx, %r8 +; AVX512-NEXT: orq %rdx, %rax +; AVX512-NEXT: sete %al ; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %r10, 8(%rdi) ; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 @@ -815,25 +1252,344 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i512: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $224, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edx), %eax +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edx), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 52(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: andl 40(%ebx), %eax +; X86-NEXT: andl 8(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 56(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 24(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: andl 44(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 12(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 60(%edi), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 28(%edi), %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: negl %edx +; X86-NEXT: movl 192(%esp,%edx), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: andl 32(%ebx), %ecx +; X86-NEXT: andl (%ebx), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: andl 16(%ebx), %edi +; X86-NEXT: andl 48(%ebx), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 36(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 20(%ebx), %ecx +; X86-NEXT: andl 52(%ebx), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: test_ne_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: andl $60, %eax -; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: btl %esi, %eax -; X64-NEXT: setb %al -; X64-NEXT: retq +; SSE-LABEL: test_ne_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rbx +; SSE-NEXT: movq -48(%rsp,%rbx), %rdx +; SSE-NEXT: movq -40(%rsp,%rbx), %r14 +; SSE-NEXT: movq %r14, %rax +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq -16(%rsp,%rbx), %r11 +; SSE-NEXT: movq -8(%rsp,%rbx), %r10 +; SSE-NEXT: shldq %cl, %r11, %r10 +; SSE-NEXT: movq -32(%rsp,%rbx), %r9 +; SSE-NEXT: movq -24(%rsp,%rbx), %r15 +; SSE-NEXT: movq %r15, %r8 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -56(%rsp,%rbx), %rsi +; SSE-NEXT: shldq %cl, %rsi, %rdx +; SSE-NEXT: shldq %cl, %r15, %r11 +; SSE-NEXT: shldq %cl, %r14, %r9 +; SSE-NEXT: movq -64(%rsp,%rbx), %rbx +; SSE-NEXT: shldq %cl, %rbx, %rsi +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rbx +; SSE-NEXT: andq 32(%rdi), %r9 +; SSE-NEXT: andq 48(%rdi), %r11 +; SSE-NEXT: andq 16(%rdi), %rdx +; SSE-NEXT: orq %r11, %rdx +; SSE-NEXT: andq 40(%rdi), %r8 +; SSE-NEXT: andq 56(%rdi), %r10 +; SSE-NEXT: andq 24(%rdi), %rax +; SSE-NEXT: orq %r10, %rax +; SSE-NEXT: andq (%rdi), %rbx +; SSE-NEXT: orq %r9, %rbx +; SSE-NEXT: orq %rdx, %rbx +; SSE-NEXT: andq 8(%rdi), %rsi +; SSE-NEXT: orq %r8, %rsi +; SSE-NEXT: orq %rax, %rsi +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: setne %al +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ne_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rsi +; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx +; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx +; AVX2-NEXT: movq %rbx, %rax +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 +; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 +; AVX2-NEXT: shldq %cl, %r11, %r10 +; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 +; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 +; AVX2-NEXT: movq %r14, %r8 +; AVX2-NEXT: shldq %cl, %r9, %r8 +; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 +; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi +; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: shldq %cl, %r14, %r11 +; AVX2-NEXT: shldq %cl, %rbx, %r9 +; AVX2-NEXT: shldq %cl, %r15, %rsi +; AVX2-NEXT: shlxq %rcx, %r15, %rcx +; AVX2-NEXT: andq 32(%rdi), %r9 +; AVX2-NEXT: andq 48(%rdi), %r11 +; AVX2-NEXT: andq 16(%rdi), %rdx +; AVX2-NEXT: andq 40(%rdi), %r8 +; AVX2-NEXT: andq 56(%rdi), %r10 +; AVX2-NEXT: andq 24(%rdi), %rax +; AVX2-NEXT: orq %r11, %rdx +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: andq (%rdi), %rcx +; AVX2-NEXT: orq %r9, %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: andq 8(%rdi), %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: setne %al +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx +; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %rax +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 +; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 +; AVX512-NEXT: shldq %cl, %r11, %r10 +; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 +; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 +; AVX512-NEXT: movq %r15, %r8 +; AVX512-NEXT: shldq %cl, %r9, %r8 +; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi +; AVX512-NEXT: shldq %cl, %rsi, %rdx +; AVX512-NEXT: shldq %cl, %r15, %r11 +; AVX512-NEXT: shldq %cl, %r14, %r9 +; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx +; AVX512-NEXT: shldq %cl, %rbx, %rsi +; AVX512-NEXT: shlxq %rcx, %rbx, %rcx +; AVX512-NEXT: andq 32(%rdi), %r9 +; AVX512-NEXT: andq 48(%rdi), %r11 +; AVX512-NEXT: andq 16(%rdi), %rdx +; AVX512-NEXT: andq 40(%rdi), %r8 +; AVX512-NEXT: andq 56(%rdi), %r10 +; AVX512-NEXT: andq 24(%rdi), %rax +; AVX512-NEXT: orq %r11, %rdx +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: andq (%rdi), %rcx +; AVX512-NEXT: orq %r9, %rcx +; AVX512-NEXT: orq %rdx, %rcx +; AVX512-NEXT: andq 8(%rdi), %rsi +; AVX512-NEXT: orq %r8, %rsi +; AVX512-NEXT: orq %rax, %rsi +; AVX512-NEXT: orq %rcx, %rsi +; AVX512-NEXT: setne %al +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -846,33 +1602,572 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $272, %esp # imm = 0x110 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edx), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl 56(%edx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: movl 24(%edx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 12(%eax), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl 60(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 28(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 240(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 32(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %eax +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 16(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl 48(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 36(%esi), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl %esi, %edi +; X86-NEXT: movl 52(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ebx, 60(%edx) +; X86-NEXT: movl %edi, 56(%edx) +; X86-NEXT: movl %ecx, 52(%edx) +; X86-NEXT: movl %esi, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: complement_ne_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $60, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btcl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: complement_ne_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rbx +; SSE-NEXT: movq (%rsp,%rbx), %rsi +; SSE-NEXT: movq 8(%rsp,%rbx), %r14 +; SSE-NEXT: movq %r14, %rax +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 32(%rsp,%rbx), %r8 +; SSE-NEXT: movq 40(%rsp,%rbx), %rbp +; SSE-NEXT: shldq %cl, %r8, %rbp +; SSE-NEXT: movq 16(%rsp,%rbx), %r9 +; SSE-NEXT: movq 24(%rsp,%rbx), %r15 +; SSE-NEXT: movq %r15, %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -8(%rsp,%rbx), %r11 +; SSE-NEXT: shldq %cl, %r11, %rsi +; SSE-NEXT: shldq %cl, %r15, %r8 +; SSE-NEXT: shldq %cl, %r14, %r9 +; SSE-NEXT: movq -16(%rsp,%rbx), %rbx +; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rbx +; SSE-NEXT: movq 24(%rdi), %r15 +; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 16(%rdi), %r12 +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %r8, %r13 +; SSE-NEXT: andq %rsi, %r12 +; SSE-NEXT: orq %r13, %r12 +; SSE-NEXT: movq %rcx, %r13 +; SSE-NEXT: andq %rbp, %r13 +; SSE-NEXT: andq %rax, %r15 +; SSE-NEXT: orq %r13, %r15 +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: movq %r14, %rcx +; SSE-NEXT: andq %r9, %rcx +; SSE-NEXT: movq (%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rbx, %r13 +; SSE-NEXT: orq %rcx, %r13 +; SSE-NEXT: orq %r12, %r13 +; SSE-NEXT: movq 40(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r12 +; SSE-NEXT: andq %r10, %r12 +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: andq %r11, %rax +; SSE-NEXT: orq %r12, %rax +; SSE-NEXT: orq %r15, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; SSE-NEXT: xorq %rcx, %r10 +; SSE-NEXT: xorq %r14, %r9 +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: xorq %rdx, %r11 +; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: movq %r8, 48(%rdi) +; SSE-NEXT: movq %rbp, 56(%rdi) +; SSE-NEXT: movq %r9, 32(%rdi) +; SSE-NEXT: movq %r10, 40(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %r15, 24(%rdi) +; SSE-NEXT: movq %rbx, (%rdi) +; SSE-NEXT: movq %r11, 8(%rdi) +; SSE-NEXT: setne %al +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: complement_ne_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, (%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rbx +; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX2-NEXT: movq %rbp, %rax +; AVX2-NEXT: shldq %cl, %rsi, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX2-NEXT: shldq %cl, %r8, %r13 +; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX2-NEXT: movq %r14, %r10 +; AVX2-NEXT: shldq %cl, %r9, %r10 +; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX2-NEXT: shldq %cl, %r11, %rsi +; AVX2-NEXT: shldq %cl, %r14, %r8 +; AVX2-NEXT: movq 16(%rdi), %r12 +; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r8, %r14 +; AVX2-NEXT: andq %rsi, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq 56(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r13, %r15 +; AVX2-NEXT: movq 24(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %rax, %r14 +; AVX2-NEXT: orq %r15, %r14 +; AVX2-NEXT: shldq %cl, %rbp, %r9 +; AVX2-NEXT: movq (%rsp,%rbx), %rdx +; AVX2-NEXT: movq 32(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r9, %r15 +; AVX2-NEXT: shlxq %rcx, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq (%rdi), %rbx +; AVX2-NEXT: movq %rbx, %rbp +; AVX2-NEXT: andq %rax, %rbp +; AVX2-NEXT: orq %r15, %rbp +; AVX2-NEXT: orq %r12, %rbp +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdx, %r11 +; AVX2-NEXT: movq 40(%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: andq %r10, %rcx +; AVX2-NEXT: movq 8(%rdi), %r15 +; AVX2-NEXT: movq %r15, %r12 +; AVX2-NEXT: andq %r11, %r12 +; AVX2-NEXT: orq %rcx, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX2-NEXT: xorq %rax, %r10 +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX2-NEXT: xorq %r15, %r11 +; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: movq %r8, 48(%rdi) +; AVX2-NEXT: movq %r13, 56(%rdi) +; AVX2-NEXT: movq %r9, 32(%rdi) +; AVX2-NEXT: movq %r10, 40(%rdi) +; AVX2-NEXT: movq %rsi, 16(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %rbx, (%rdi) +; AVX2-NEXT: movq %r11, 8(%rdi) +; AVX2-NEXT: setne %al +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: complement_ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, (%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX512-NEXT: movq %rbp, %rax +; AVX512-NEXT: shldq %cl, %rsi, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX512-NEXT: shldq %cl, %r8, %r13 +; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %r10 +; AVX512-NEXT: shldq %cl, %r9, %r10 +; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX512-NEXT: shldq %cl, %r11, %rsi +; AVX512-NEXT: shldq %cl, %r14, %r8 +; AVX512-NEXT: movq 16(%rdi), %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r8, %r14 +; AVX512-NEXT: andq %rsi, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq 56(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r13, %r15 +; AVX512-NEXT: movq 24(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %rax, %r14 +; AVX512-NEXT: orq %r15, %r14 +; AVX512-NEXT: shldq %cl, %rbp, %r9 +; AVX512-NEXT: movq (%rsp,%rbx), %rdx +; AVX512-NEXT: movq 32(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r9, %r15 +; AVX512-NEXT: shlxq %rcx, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq (%rdi), %rbx +; AVX512-NEXT: movq %rbx, %rbp +; AVX512-NEXT: andq %rax, %rbp +; AVX512-NEXT: orq %r15, %rbp +; AVX512-NEXT: orq %r12, %rbp +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rdx, %r11 +; AVX512-NEXT: movq 40(%rdi), %rax +; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: andq %r10, %rcx +; AVX512-NEXT: movq 8(%rdi), %r15 +; AVX512-NEXT: movq %r15, %r12 +; AVX512-NEXT: andq %r11, %r12 +; AVX512-NEXT: orq %rcx, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: xorq %rax, %r10 +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: xorq %r15, %r11 +; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: orq %rbp, %r12 +; AVX512-NEXT: movq %r8, 48(%rdi) +; AVX512-NEXT: movq %r13, 56(%rdi) +; AVX512-NEXT: movq %r9, 32(%rdi) +; AVX512-NEXT: movq %r10, 40(%rdi) +; AVX512-NEXT: movq %rsi, 16(%rdi) +; AVX512-NEXT: movq %rcx, 24(%rdi) +; AVX512-NEXT: movq %rbx, (%rdi) +; AVX512-NEXT: movq %r11, 8(%rdi) +; AVX512-NEXT: setne %al +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -887,33 +2182,606 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $288, %esp # imm = 0x120 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 4(%edi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edi), %eax +; X86-NEXT: andl $31, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl 12(%edi), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edi), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edi), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edi), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl 52(%edi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 56(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl 44(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%edi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 256(%esp,%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl 32(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%esi), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl 52(%ebx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 60(%eax) +; X86-NEXT: movl %esi, 56(%eax) +; X86-NEXT: movl %ecx, 52(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 44(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 40(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 36(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 32(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 28(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 24(%eax) +; X86-NEXT: movl %ebx, 20(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 48(%eax) +; X86-NEXT: sete %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: reset_eq_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $60, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setae %al -; X64-NEXT: btrl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: reset_eq_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rdx +; SSE-NEXT: movq (%rsp,%rdx), %r9 +; SSE-NEXT: movq 8(%rsp,%rdx), %r8 +; SSE-NEXT: movq %r8, %rsi +; SSE-NEXT: shldq %cl, %r9, %rsi +; SSE-NEXT: movq -8(%rsp,%rdx), %rax +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: movq 16(%rsp,%rdx), %r14 +; SSE-NEXT: movq 24(%rsp,%rdx), %r10 +; SSE-NEXT: movq %r10, %rbx +; SSE-NEXT: shldq %cl, %r14, %rbx +; SSE-NEXT: shldq %cl, %r8, %r14 +; SSE-NEXT: movq 32(%rsp,%rdx), %r13 +; SSE-NEXT: movq 40(%rsp,%rdx), %r12 +; SSE-NEXT: shldq %cl, %r13, %r12 +; SSE-NEXT: shldq %cl, %r10, %r13 +; SSE-NEXT: movq -16(%rsp,%rdx), %rdx +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %r12, %rbp +; SSE-NEXT: movq %r9, %r15 +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: movq 16(%rdi), %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %r13 +; SSE-NEXT: andq %r8, %r9 +; SSE-NEXT: orq %r13, %r9 +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %r12 +; SSE-NEXT: movq 24(%rdi), %r10 +; SSE-NEXT: andq %r10, %rsi +; SSE-NEXT: orq %r12, %rsi +; SSE-NEXT: movq %r14, %r13 +; SSE-NEXT: movq 32(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %r14 +; SSE-NEXT: movq %rdx, %r12 +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %r14, %rdx +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: movq %rbx, %r14 +; SSE-NEXT: movq 40(%rdi), %rcx +; SSE-NEXT: andq %rcx, %rbx +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: movq 8(%rdi), %r8 +; SSE-NEXT: andq %r8, %rax +; SSE-NEXT: orq %rbx, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: notq %r11 +; SSE-NEXT: andq %r10, %r11 +; SSE-NEXT: notq %r15 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: notq %r14 +; SSE-NEXT: andq %rcx, %r14 +; SSE-NEXT: notq %r13 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; SSE-NEXT: notq %rbp +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: notq %rcx +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: notq %r9 +; SSE-NEXT: andq %r8, %r9 +; SSE-NEXT: notq %r12 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rcx, 48(%rdi) +; SSE-NEXT: movq %rbp, 56(%rdi) +; SSE-NEXT: movq %r13, 32(%rdi) +; SSE-NEXT: movq %r14, 40(%rdi) +; SSE-NEXT: movq %r15, 16(%rdi) +; SSE-NEXT: movq %r11, 24(%rdi) +; SSE-NEXT: movq %r12, (%rdi) +; SSE-NEXT: movq %r9, 8(%rdi) +; SSE-NEXT: sete %al +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: reset_eq_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rdx +; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 +; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx +; AVX2-NEXT: movq %rbx, %rax +; AVX2-NEXT: shldq %cl, %r8, %rax +; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 +; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 +; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 +; AVX2-NEXT: movq %r14, %r9 +; AVX2-NEXT: shldq %cl, %r11, %r9 +; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 +; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx +; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: shldq %cl, %r14, %r10 +; AVX2-NEXT: shldq %cl, %rbx, %r11 +; AVX2-NEXT: shldq %cl, %r15, %rdx +; AVX2-NEXT: shlxq %rcx, %r15, %rcx +; AVX2-NEXT: movq 24(%rdi), %rbx +; AVX2-NEXT: movq 56(%rdi), %r14 +; AVX2-NEXT: movq 16(%rdi), %r15 +; AVX2-NEXT: movq 48(%rdi), %r13 +; AVX2-NEXT: movq 32(%rdi), %rbp +; AVX2-NEXT: andnq %rbp, %r11, %r12 +; AVX2-NEXT: andq %r11, %rbp +; AVX2-NEXT: andnq %r13, %r10, %r11 +; AVX2-NEXT: andq %r10, %r13 +; AVX2-NEXT: andnq %r15, %r8, %r10 +; AVX2-NEXT: andq %r8, %r15 +; AVX2-NEXT: movq 40(%rdi), %r8 +; AVX2-NEXT: orq %r13, %r15 +; AVX2-NEXT: andnq %r8, %r9, %r13 +; AVX2-NEXT: andq %r9, %r8 +; AVX2-NEXT: andnq %r14, %rsi, %r9 +; AVX2-NEXT: andq %rsi, %r14 +; AVX2-NEXT: andnq %rbx, %rax, %rsi +; AVX2-NEXT: andq %rax, %rbx +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: orq %r14, %rbx +; AVX2-NEXT: andnq %rax, %rcx, %r14 +; AVX2-NEXT: andq %rcx, %rax +; AVX2-NEXT: orq %rbp, %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: orq %r15, %rax +; AVX2-NEXT: andnq %rcx, %rdx, %r15 +; AVX2-NEXT: andq %rdx, %rcx +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: orq %rbx, %rcx +; AVX2-NEXT: orq %rax, %rcx +; AVX2-NEXT: movq %r11, 48(%rdi) +; AVX2-NEXT: movq %r9, 56(%rdi) +; AVX2-NEXT: movq %r12, 32(%rdi) +; AVX2-NEXT: movq %r13, 40(%rdi) +; AVX2-NEXT: movq %r10, 16(%rdi) +; AVX2-NEXT: movq %rsi, 24(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: movq %r15, 8(%rdi) +; AVX2-NEXT: sete %al +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: reset_eq_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 +; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %rax +; AVX512-NEXT: shldq %cl, %r8, %rax +; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 +; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi +; AVX512-NEXT: shldq %cl, %r10, %rsi +; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 +; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 +; AVX512-NEXT: movq %r15, %r9 +; AVX512-NEXT: shldq %cl, %r11, %r9 +; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx +; AVX512-NEXT: shldq %cl, %rdx, %r8 +; AVX512-NEXT: shldq %cl, %r15, %r10 +; AVX512-NEXT: shldq %cl, %r14, %r11 +; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx +; AVX512-NEXT: shldq %cl, %rbx, %rdx +; AVX512-NEXT: shlxq %rcx, %rbx, %rcx +; AVX512-NEXT: movq 24(%rdi), %rbx +; AVX512-NEXT: movq 56(%rdi), %r14 +; AVX512-NEXT: movq 16(%rdi), %r15 +; AVX512-NEXT: movq 48(%rdi), %r13 +; AVX512-NEXT: movq 32(%rdi), %rbp +; AVX512-NEXT: andnq %rbp, %r11, %r12 +; AVX512-NEXT: andq %r11, %rbp +; AVX512-NEXT: andnq %r13, %r10, %r11 +; AVX512-NEXT: andq %r10, %r13 +; AVX512-NEXT: andnq %r15, %r8, %r10 +; AVX512-NEXT: andq %r8, %r15 +; AVX512-NEXT: movq 40(%rdi), %r8 +; AVX512-NEXT: orq %r13, %r15 +; AVX512-NEXT: andnq %r8, %r9, %r13 +; AVX512-NEXT: andq %r9, %r8 +; AVX512-NEXT: andnq %r14, %rsi, %r9 +; AVX512-NEXT: andq %rsi, %r14 +; AVX512-NEXT: andnq %rbx, %rax, %rsi +; AVX512-NEXT: andq %rax, %rbx +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: orq %r14, %rbx +; AVX512-NEXT: andnq %rax, %rcx, %r14 +; AVX512-NEXT: andq %rcx, %rax +; AVX512-NEXT: orq %rbp, %rax +; AVX512-NEXT: movq 8(%rdi), %rcx +; AVX512-NEXT: orq %r15, %rax +; AVX512-NEXT: andnq %rcx, %rdx, %r15 +; AVX512-NEXT: andq %rdx, %rcx +; AVX512-NEXT: orq %r8, %rcx +; AVX512-NEXT: orq %rbx, %rcx +; AVX512-NEXT: orq %rax, %rcx +; AVX512-NEXT: movq %r11, 48(%rdi) +; AVX512-NEXT: movq %r9, 56(%rdi) +; AVX512-NEXT: movq %r12, 32(%rdi) +; AVX512-NEXT: movq %r13, 40(%rdi) +; AVX512-NEXT: movq %r10, 16(%rdi) +; AVX512-NEXT: movq %rsi, 24(%rdi) +; AVX512-NEXT: movq %r14, (%rdi) +; AVX512-NEXT: movq %r15, 8(%rdi) +; AVX512-NEXT: sete %al +; AVX512-NEXT: addq $8, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -929,33 +2797,572 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi -; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $272, %esp # imm = 0x110 +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: andl $60, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%edx), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl 56(%edx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %ebx +; X86-NEXT: movl 24(%edx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 12(%eax), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl 60(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 28(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 240(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 32(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %eax +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 16(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl 48(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 36(%esi), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl %esi, %edi +; X86-NEXT: movl 52(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ebx, 60(%edx) +; X86-NEXT: movl %edi, 56(%edx) +; X86-NEXT: movl %ecx, 52(%edx) +; X86-NEXT: movl %esi, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 12(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: set_ne_i512: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: shrl $3, %ecx -; X64-NEXT: andl $60, %ecx -; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al -; X64-NEXT: btsl %esi, %edx -; X64-NEXT: movl %edx, (%rdi,%rcx) -; X64-NEXT: retq +; SSE-LABEL: set_ne_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rbx +; SSE-NEXT: movq (%rsp,%rbx), %rsi +; SSE-NEXT: movq 8(%rsp,%rbx), %r14 +; SSE-NEXT: movq %r14, %rax +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 32(%rsp,%rbx), %r8 +; SSE-NEXT: movq 40(%rsp,%rbx), %rbp +; SSE-NEXT: shldq %cl, %r8, %rbp +; SSE-NEXT: movq 16(%rsp,%rbx), %r9 +; SSE-NEXT: movq 24(%rsp,%rbx), %r15 +; SSE-NEXT: movq %r15, %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -8(%rsp,%rbx), %r11 +; SSE-NEXT: shldq %cl, %r11, %rsi +; SSE-NEXT: shldq %cl, %r15, %r8 +; SSE-NEXT: shldq %cl, %r14, %r9 +; SSE-NEXT: movq -16(%rsp,%rbx), %rbx +; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rbx +; SSE-NEXT: movq 24(%rdi), %r15 +; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 16(%rdi), %r12 +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %r8, %r13 +; SSE-NEXT: andq %rsi, %r12 +; SSE-NEXT: orq %r13, %r12 +; SSE-NEXT: movq %rcx, %r13 +; SSE-NEXT: andq %rbp, %r13 +; SSE-NEXT: andq %rax, %r15 +; SSE-NEXT: orq %r13, %r15 +; SSE-NEXT: movq 32(%rdi), %r14 +; SSE-NEXT: movq %r14, %rcx +; SSE-NEXT: andq %r9, %rcx +; SSE-NEXT: movq (%rdi), %r13 +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rbx, %r13 +; SSE-NEXT: orq %rcx, %r13 +; SSE-NEXT: orq %r12, %r13 +; SSE-NEXT: movq 40(%rdi), %rcx +; SSE-NEXT: movq %rcx, %r12 +; SSE-NEXT: andq %r10, %r12 +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: andq %r11, %rax +; SSE-NEXT: orq %r12, %rax +; SSE-NEXT: orq %r15, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %r10 +; SSE-NEXT: orq %r14, %r9 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: orq %rdx, %r11 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: movq %r8, 48(%rdi) +; SSE-NEXT: movq %rbp, 56(%rdi) +; SSE-NEXT: movq %r9, 32(%rdi) +; SSE-NEXT: movq %r10, 40(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %r15, 24(%rdi) +; SSE-NEXT: movq %rbx, (%rdi) +; SSE-NEXT: movq %r11, 8(%rdi) +; SSE-NEXT: setne %al +; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: set_ne_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, (%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rbx +; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX2-NEXT: movq %rbp, %rax +; AVX2-NEXT: shldq %cl, %rsi, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX2-NEXT: shldq %cl, %r8, %r13 +; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX2-NEXT: movq %r14, %r10 +; AVX2-NEXT: shldq %cl, %r9, %r10 +; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX2-NEXT: shldq %cl, %r11, %rsi +; AVX2-NEXT: shldq %cl, %r14, %r8 +; AVX2-NEXT: movq 16(%rdi), %r12 +; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r8, %r14 +; AVX2-NEXT: andq %rsi, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq 56(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r13, %r15 +; AVX2-NEXT: movq 24(%rdi), %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %rax, %r14 +; AVX2-NEXT: orq %r15, %r14 +; AVX2-NEXT: shldq %cl, %rbp, %r9 +; AVX2-NEXT: movq (%rsp,%rbx), %rdx +; AVX2-NEXT: movq 32(%rdi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r9, %r15 +; AVX2-NEXT: shlxq %rcx, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq (%rdi), %rbx +; AVX2-NEXT: movq %rbx, %rbp +; AVX2-NEXT: andq %rax, %rbp +; AVX2-NEXT: orq %r15, %rbp +; AVX2-NEXT: orq %r12, %rbp +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdx, %r11 +; AVX2-NEXT: movq 40(%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: andq %r10, %rcx +; AVX2-NEXT: movq 8(%rdi), %r15 +; AVX2-NEXT: movq %r15, %r12 +; AVX2-NEXT: andq %r11, %r12 +; AVX2-NEXT: orq %rcx, %r12 +; AVX2-NEXT: orq %r14, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX2-NEXT: orq %rax, %r10 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX2-NEXT: orq %r15, %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq %rbp, %r12 +; AVX2-NEXT: movq %r8, 48(%rdi) +; AVX2-NEXT: movq %r13, 56(%rdi) +; AVX2-NEXT: movq %r9, 32(%rdi) +; AVX2-NEXT: movq %r10, 40(%rdi) +; AVX2-NEXT: movq %rsi, 16(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %rbx, (%rdi) +; AVX2-NEXT: movq %r11, 8(%rdi) +; AVX2-NEXT: setne %al +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: set_ne_i512: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, (%rsp) +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rbx +; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi +; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp +; AVX512-NEXT: movq %rbp, %rax +; AVX512-NEXT: shldq %cl, %rsi, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 +; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 +; AVX512-NEXT: shldq %cl, %r8, %r13 +; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 +; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 +; AVX512-NEXT: movq %r14, %r10 +; AVX512-NEXT: shldq %cl, %r9, %r10 +; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 +; AVX512-NEXT: shldq %cl, %r11, %rsi +; AVX512-NEXT: shldq %cl, %r14, %r8 +; AVX512-NEXT: movq 16(%rdi), %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 48(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r8, %r14 +; AVX512-NEXT: andq %rsi, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq 56(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r13, %r15 +; AVX512-NEXT: movq 24(%rdi), %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %rax, %r14 +; AVX512-NEXT: orq %r15, %r14 +; AVX512-NEXT: shldq %cl, %rbp, %r9 +; AVX512-NEXT: movq (%rsp,%rbx), %rdx +; AVX512-NEXT: movq 32(%rdi), %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r9, %r15 +; AVX512-NEXT: shlxq %rcx, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq (%rdi), %rbx +; AVX512-NEXT: movq %rbx, %rbp +; AVX512-NEXT: andq %rax, %rbp +; AVX512-NEXT: orq %r15, %rbp +; AVX512-NEXT: orq %r12, %rbp +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rdx, %r11 +; AVX512-NEXT: movq 40(%rdi), %rax +; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: andq %r10, %rcx +; AVX512-NEXT: movq 8(%rdi), %r15 +; AVX512-NEXT: movq %r15, %r12 +; AVX512-NEXT: andq %r11, %r12 +; AVX512-NEXT: orq %rcx, %r12 +; AVX512-NEXT: orq %r14, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: orq %rax, %r10 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: orq %r15, %r11 +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX512-NEXT: orq %rbp, %r12 +; AVX512-NEXT: movq %r8, 48(%rdi) +; AVX512-NEXT: movq %r13, 56(%rdi) +; AVX512-NEXT: movq %r9, 32(%rdi) +; AVX512-NEXT: movq %r10, 40(%rdi) +; AVX512-NEXT: movq %rsi, 16(%rdi) +; AVX512-NEXT: movq %rcx, 24(%rdi) +; AVX512-NEXT: movq %rbx, (%rdi) +; AVX512-NEXT: movq %r11, 8(%rdi) +; AVX512-NEXT: setne %al +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -976,14 +3383,13 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 +; X86-NEXT: subl $432, %esp # imm = 0x1B0 ; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: shrl $3, %edx ; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax +; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl %edx, %esi ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1016,58 +3422,60 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax +; X86-NEXT: movl 56(%esi), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl 60(%esi), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl 52(%esi), %eax +; X86-NEXT: movl 48(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%esi), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%esi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 16(%ebp), %ebx +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: shldl %cl, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl %cl, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1092,12 +3500,9 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl %cl, %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shldl %cl, %edi, %edx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1129,148 +3534,273 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %eax, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%ebx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%ebx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl 56(%edi), %ebx +; X86-NEXT: movl 60(%edi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 52(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 48(%edi), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl 40(%edi), %ebx +; X86-NEXT: movl 44(%edi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 36(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 32(%edi), %ebx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 28(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 24(%edi), %ebx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 20(%edi), %eax +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 16(%edi), %ebx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 12(%edi), %eax +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 8(%edi), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%edi), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl (%edi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 60(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 56(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 52(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 44(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 40(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 36(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 32(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 28(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 24(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 20(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 16(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, 48(%eax) +; X86-NEXT: sete %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1286,8 +3816,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: subq $216, %rsp ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) @@ -1300,103 +3829,139 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 160(%rsp,%r12), %rax -; SSE-NEXT: movq 168(%rsp,%r12), %r10 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 152(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 144(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 136(%rsp,%r12), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: movq 128(%rsp,%r12), %r14 -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: movq 120(%rsp,%r12), %r15 -; SSE-NEXT: shldq %cl, %r15, %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %r13 -; SSE-NEXT: shldq %cl, %r13, %r15 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %r10 +; SSE-NEXT: movq 184(%rsp,%r10), %r11 +; SSE-NEXT: movq 192(%rsp,%r10), %rsi +; SSE-NEXT: movq %rsi, %r13 +; SSE-NEXT: shldq %cl, %r11, %r13 +; SSE-NEXT: movq 200(%rsp,%r10), %r15 +; SSE-NEXT: shldq %cl, %rsi, %r15 +; SSE-NEXT: movq 168(%rsp,%r10), %rbx +; SSE-NEXT: movq 176(%rsp,%r10), %rsi +; SSE-NEXT: movq %rsi, %r14 +; SSE-NEXT: shldq %cl, %rbx, %r14 +; SSE-NEXT: shldq %cl, %rsi, %r11 +; SSE-NEXT: movq 152(%rsp,%r10), %rax +; SSE-NEXT: movq 160(%rsp,%r10), %r8 +; SSE-NEXT: movq %r8, %r12 +; SSE-NEXT: shldq %cl, %rax, %r12 +; SSE-NEXT: shldq %cl, %r8, %rbx +; SSE-NEXT: movq 144(%rsp,%r10), %r9 +; SSE-NEXT: movq %r9, %r8 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: shldq %cl, %r9, %rax +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movl %edx, %edx ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, (%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 32(%rsp,%r12), %rax -; SSE-NEXT: movq 40(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq 16(%rdi), %rdx ; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 24(%rsp,%r12), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: shldq %cl, %r8, %rsi -; SSE-NEXT: movq (%rsp,%r12), %rbp -; SSE-NEXT: shldq %cl, %rbp, %r8 -; SSE-NEXT: movq -8(%rsp,%r12), %r9 -; SSE-NEXT: shldq %cl, %r9, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: notq %r10 -; SSE-NEXT: andq 48(%rdi), %r10 -; SSE-NEXT: orq %rax, %r10 +; SSE-NEXT: movq 48(%rdi), %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rsi, %r13 +; SSE-NEXT: andq %rdx, %r12 +; SSE-NEXT: orq %r13, %r12 +; SSE-NEXT: movq %r15, %rsi +; SSE-NEXT: movq 56(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %r15 +; SSE-NEXT: movq %rbx, %r13 +; SSE-NEXT: movq 24(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %rbx +; SSE-NEXT: orq %r15, %rbx +; SSE-NEXT: movq %r14, %rbp +; SSE-NEXT: movq 32(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %r14 +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %r8 +; SSE-NEXT: orq %r14, %r8 +; SSE-NEXT: orq %r12, %r8 +; SSE-NEXT: movq %r11, %r12 +; SSE-NEXT: movq 40(%rdi), %r9 +; SSE-NEXT: andq %r9, %r11 +; SSE-NEXT: movq %rax, %r14 +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq %rdx, %rax +; SSE-NEXT: orq %r11, %rax +; SSE-NEXT: orq %rbx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE-NEXT: notq %rax -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: movq 56(%rsp,%r10), %r11 +; SSE-NEXT: movq 64(%rsp,%r10), %rax +; SSE-NEXT: movq %rax, %rbx +; SSE-NEXT: shldq %cl, %r11, %rbx +; SSE-NEXT: orq %rbx, %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: notq %rsi +; SSE-NEXT: movq 72(%rsp,%r10), %rbx +; SSE-NEXT: shldq %cl, %rax, %rbx +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: notq %rbp +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; SSE-NEXT: movq 40(%rsp,%r10), %rax +; SSE-NEXT: movq 48(%rsp,%r10), %rdx +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: shldq %cl, %rax, %rbx +; SSE-NEXT: orq %rbx, %rbp +; SSE-NEXT: notq %r12 +; SSE-NEXT: andq %r9, %r12 +; SSE-NEXT: shldq %cl, %rdx, %r11 +; SSE-NEXT: movq 24(%rsp,%r10), %r9 +; SSE-NEXT: movq 32(%rsp,%r10), %rdx +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: shldq %cl, %r9, %rbx +; SSE-NEXT: orq %r11, %r12 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; SSE-NEXT: notq %r11 -; SSE-NEXT: andq 32(%rdi), %r11 -; SSE-NEXT: orq %rsi, %r11 -; SSE-NEXT: notq %rbx -; SSE-NEXT: andq 24(%rdi), %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq 16(%rdi), %r14 -; SSE-NEXT: orq %rbp, %r14 -; SSE-NEXT: notq %r15 -; SSE-NEXT: movq -16(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: andq 8(%rdi), %r15 -; SSE-NEXT: orq %r9, %r15 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: orq %rbx, %r11 ; SSE-NEXT: notq %r13 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: andq (%rdi), %r13 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; SSE-NEXT: orq %rax, %r13 +; SSE-NEXT: notq %r15 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; SSE-NEXT: movq 16(%rsp,%r10), %rax +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: orq %rdx, %r15 +; SSE-NEXT: notq %r14 +; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: orq %r9, %r14 +; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %r10, 48(%rdi) -; SSE-NEXT: movq %rdx, 40(%rdi) -; SSE-NEXT: movq %r11, 32(%rdi) -; SSE-NEXT: movq %rbx, 24(%rdi) -; SSE-NEXT: movq %r14, 16(%rdi) -; SSE-NEXT: movq %r15, 8(%rdi) -; SSE-NEXT: movq %r13, (%rdi) -; SSE-NEXT: setae %al -; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: movq %rax, 48(%rdi) +; SSE-NEXT: movq %rsi, 56(%rdi) +; SSE-NEXT: movq %rbp, 32(%rdi) +; SSE-NEXT: movq %r12, 40(%rdi) +; SSE-NEXT: movq %r11, 16(%rdi) +; SSE-NEXT: movq %r13, 24(%rdi) +; SSE-NEXT: movq %r15, (%rdi) +; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: sete %al +; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 ; SSE-NEXT: popq %r13 @@ -1413,103 +3978,132 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $168, %rsp +; AVX2-NEXT: subq $200, %rsp ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] ; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %r11d -; AVX2-NEXT: shrl $3, %r11d -; AVX2-NEXT: movl %r11d, %eax -; AVX2-NEXT: andl $56, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r10 -; AVX2-NEXT: movq 104(%rsp,%r10), %r15 -; AVX2-NEXT: movq 112(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r8 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 128(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: shldq %cl, %rsi, %rbx -; AVX2-NEXT: movq 136(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, %r14 -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 144(%rsp,%r10), %rax -; AVX2-NEXT: movq %rax, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r12 -; AVX2-NEXT: movq 96(%rsp,%r10), %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 152(%rsp,%r10), %r13 -; AVX2-NEXT: shldq %cl, %rax, %r13 -; AVX2-NEXT: shldq %cl, %rsi, %r15 -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %esi, %r8d +; AVX2-NEXT: andl $63, %r8d +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %rsi +; AVX2-NEXT: movq 144(%rsp,%rsi), %r11 +; AVX2-NEXT: movq 152(%rsp,%rsi), %r12 +; AVX2-NEXT: movq %r12, %r10 +; AVX2-NEXT: movl %r8d, %ecx +; AVX2-NEXT: shldq %cl, %r11, %r10 +; AVX2-NEXT: movq 176(%rsp,%rsi), %r14 +; AVX2-NEXT: movq 184(%rsp,%rsi), %r9 +; AVX2-NEXT: shldq %cl, %r14, %r9 +; AVX2-NEXT: movq 160(%rsp,%rsi), %r15 +; AVX2-NEXT: movq 168(%rsp,%rsi), %r13 +; AVX2-NEXT: movq %r13, %rbx +; AVX2-NEXT: shldq %cl, %r15, %rbx +; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp +; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 136(%rsp,%rsi), %rax +; AVX2-NEXT: shldq %cl, %rax, %r11 +; AVX2-NEXT: shldq %cl, %r13, %r14 +; AVX2-NEXT: shldq %cl, %r12, %r15 +; AVX2-NEXT: shldq %cl, %rbp, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl %edx, %edx ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, (%rsp) ; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rsp,%r10), %rbp -; AVX2-NEXT: movq 24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq 8(%rsp,%r10), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: movq (%rsp,%r10), %rax -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq -8(%rsp,%r10), %r8 -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%r10), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %r8 -; AVX2-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX2-NEXT: orq %r9, %r13 -; AVX2-NEXT: movq -24(%rsp,%r10), %r9 -; AVX2-NEXT: shldq %cl, %r9, %rsi -; AVX2-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 40(%rdi), %r14, %r14 +; AVX2-NEXT: movq 16(%rdi), %r12 +; AVX2-NEXT: movq 48(%rdi), %rbp +; AVX2-NEXT: movq 32(%rdi), %r13 +; AVX2-NEXT: andnq %r13, %r15, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r15, %r13 +; AVX2-NEXT: andnq %rbp, %r14, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r14, %rbp +; AVX2-NEXT: andnq %r12, %r11, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r11, %r12 +; AVX2-NEXT: movq 40(%rdi), %rax ; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: orq %rdx, %r14 -; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: movq -32(%rsp,%r10), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %rbx -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %r9 +; AVX2-NEXT: andnq %rax, %rbx, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: andq %rbx, %rbp +; AVX2-NEXT: movq 56(%rdi), %rcx +; AVX2-NEXT: andnq %rcx, %r9, %rbx +; AVX2-NEXT: andq %r9, %rcx +; AVX2-NEXT: movq 24(%rdi), %rax +; AVX2-NEXT: andnq %rax, %r10, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq %r10, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: movq (%rdi), %r10 +; AVX2-NEXT: andnq %r10, %rcx, %r15 +; AVX2-NEXT: andq %rcx, %r10 +; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx +; AVX2-NEXT: movq 48(%rsp,%rsi), %r11 +; AVX2-NEXT: movq %r11, %r9 +; AVX2-NEXT: movl %r8d, %ecx +; AVX2-NEXT: shldq %cl, %rdx, %r9 +; AVX2-NEXT: orq %r13, %r10 +; AVX2-NEXT: orq %r12, %r10 +; AVX2-NEXT: movq 8(%rdi), %r13 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 16(%rdi), %r10, %r10 -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rsi, %r10 -; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %rax -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: andl $60, %r11d -; AVX2-NEXT: movl (%rdi,%r11), %r8d -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %r8d -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r12, 48(%rdi) -; AVX2-NEXT: movq %r14, 40(%rdi) -; AVX2-NEXT: movq %rdx, 32(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $168, %rsp +; AVX2-NEXT: andnq %r13, %rcx, %r12 +; AVX2-NEXT: andq %rcx, %r13 +; AVX2-NEXT: orq %rbp, %r13 +; AVX2-NEXT: orq %rax, %r13 +; AVX2-NEXT: movq 56(%rsp,%rsi), %rax +; AVX2-NEXT: movl %r8d, %ecx +; AVX2-NEXT: shldq %cl, %r11, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: orq %r9, %r14 +; AVX2-NEXT: orq %rax, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 24(%rsp,%rsi), %rax +; AVX2-NEXT: movq 32(%rsp,%rsi), %r9 +; AVX2-NEXT: movq %r9, %r11 +; AVX2-NEXT: shldq %cl, %rax, %r11 +; AVX2-NEXT: shldq %cl, %r9, %rdx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX2-NEXT: orq %r11, %rbp +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: orq %rdx, %rbx +; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx +; AVX2-NEXT: movq 16(%rsp,%rsi), %r9 +; AVX2-NEXT: movq %r9, %r11 +; AVX2-NEXT: shldq %cl, %rdx, %r11 +; AVX2-NEXT: shldq %cl, %r9, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: movq (%rsp,%rsi), %rsi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: orq %rax, %r11 +; AVX2-NEXT: shlxq %r8, %rsi, %rax +; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: orq %rax, %r15 +; AVX2-NEXT: orq %rdx, %r12 +; AVX2-NEXT: orq %r10, %r13 +; AVX2-NEXT: movq %r14, 48(%rdi) +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: movq %rax, 56(%rdi) +; AVX2-NEXT: movq %rbp, 32(%rdi) +; AVX2-NEXT: movq %rbx, 40(%rdi) +; AVX2-NEXT: movq %r9, 16(%rdi) +; AVX2-NEXT: movq %r11, 24(%rdi) +; AVX2-NEXT: movq %r15, (%rdi) +; AVX2-NEXT: movq %r12, 8(%rdi) +; AVX2-NEXT: sete %al +; AVX2-NEXT: addq $200, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 ; AVX2-NEXT: popq %r13 @@ -1527,100 +4121,131 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp +; AVX512-NEXT: subq $184, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] ; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512-NEXT: movl %esi, %ecx ; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r8d -; AVX512-NEXT: shrl $3, %r8d -; AVX512-NEXT: movl %r8d, %eax -; AVX512-NEXT: andl $56, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %r9 -; AVX512-NEXT: movq 88(%rsp,%r9), %r10 -; AVX512-NEXT: movq 96(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r11 -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 112(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: shldq %cl, %rsi, %rbx -; AVX512-NEXT: movq 120(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %r14 -; AVX512-NEXT: shldq %cl, %rax, %r14 -; AVX512-NEXT: movq 128(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %rsi, %r12 -; AVX512-NEXT: movq 136(%rsp,%r9), %r13 -; AVX512-NEXT: shldq %cl, %rax, %r13 -; AVX512-NEXT: movq 80(%rsp,%r9), %r15 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: shrl $3, %esi +; AVX512-NEXT: andl $56, %esi +; AVX512-NEXT: negl %esi +; AVX512-NEXT: movslq %esi, %rsi +; AVX512-NEXT: movq 128(%rsp,%rsi), %r10 +; AVX512-NEXT: movq 136(%rsp,%rsi), %r12 +; AVX512-NEXT: movq %r12, %rax +; AVX512-NEXT: shldq %cl, %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 160(%rsp,%rsi), %r14 +; AVX512-NEXT: movq 168(%rsp,%rsi), %rax +; AVX512-NEXT: shldq %cl, %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 144(%rsp,%rsi), %r15 +; AVX512-NEXT: movq 152(%rsp,%rsi), %r11 +; AVX512-NEXT: movq %r11, %rbx +; AVX512-NEXT: shldq %cl, %r15, %rbx +; AVX512-NEXT: movq 120(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rax, %r10 +; AVX512-NEXT: shldq %cl, %r11, %r14 +; AVX512-NEXT: movq %rdi, %r9 +; AVX512-NEXT: movq 112(%rsp,%rsi), %r11 +; AVX512-NEXT: shldq %cl, %r12, %r15 +; AVX512-NEXT: movl %edx, %edx ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rsp,%r9), %rbp -; AVX512-NEXT: movq 8(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rbp, %rsi -; AVX512-NEXT: movq -8(%rsp,%r9), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: movq -16(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r13, %r13 -; AVX512-NEXT: andnq 48(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rsi, %r13 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 40(%rdi), %r14, %r14 -; AVX512-NEXT: orq %rdx, %r14 -; AVX512-NEXT: movq -24(%rsp,%r9), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: movq -32(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r15, %rbx -; AVX512-NEXT: andnq 24(%rdi), %r11, %r11 -; AVX512-NEXT: orq %rsi, %r11 -; AVX512-NEXT: movq -48(%rsp,%r9), %rsi -; AVX512-NEXT: movq -40(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andnq 16(%rdi), %r15, %r15 +; AVX512-NEXT: movq 16(%rdi), %r12 +; AVX512-NEXT: movq 48(%rdi), %r13 +; AVX512-NEXT: movq 32(%rdi), %rbp +; AVX512-NEXT: andnq %rbp, %r15, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r15, %rbp +; AVX512-NEXT: andnq %r13, %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r14, %r13 +; AVX512-NEXT: andnq %r12, %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq %r10, %r12 +; AVX512-NEXT: movq 40(%rdi), %r8 +; AVX512-NEXT: orq %r13, %r12 +; AVX512-NEXT: andnq %r8, %rbx, %rdi +; AVX512-NEXT: andq %rbx, %r8 +; AVX512-NEXT: movq 56(%r9), %r13 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: andnq %r13, %rdx, %r10 +; AVX512-NEXT: andq %rdx, %r13 +; AVX512-NEXT: movq 24(%r9), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: andnq %rax, %rdx, %r15 +; AVX512-NEXT: andq %rdx, %rax +; AVX512-NEXT: orq %r13, %rax +; AVX512-NEXT: shlxq %rcx, %r11, %r13 +; AVX512-NEXT: movq (%r9), %rdx +; AVX512-NEXT: andnq %rdx, %r13, %r14 +; AVX512-NEXT: andq %r13, %rdx +; AVX512-NEXT: orq %rbp, %rdx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r11, %rbp +; AVX512-NEXT: orq %r12, %rdx +; AVX512-NEXT: movq 8(%r9), %r13 +; AVX512-NEXT: andnq %r13, %rbp, %rbx +; AVX512-NEXT: andq %rbp, %r13 +; AVX512-NEXT: orq %r8, %r13 +; AVX512-NEXT: movq 24(%rsp,%rsi), %r8 +; AVX512-NEXT: orq %rax, %r13 +; AVX512-NEXT: movq 32(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: shldq %cl, %r8, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: orq %r12, %r11 +; AVX512-NEXT: movq 40(%rsp,%rsi), %r12 +; AVX512-NEXT: shldq %cl, %rax, %r12 +; AVX512-NEXT: orq %r12, %r10 +; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 8(%rsp,%rsi), %rax +; AVX512-NEXT: movq 16(%rsp,%rsi), %r12 +; AVX512-NEXT: movq %r12, %rbp +; AVX512-NEXT: shldq %cl, %rax, %rbp +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: orq %rbp, %r10 +; AVX512-NEXT: shldq %cl, %r12, %r8 +; AVX512-NEXT: orq %r8, %rdi +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq -8(%rsp,%rsi), %r8 +; AVX512-NEXT: movq (%rsp,%rsi), %r12 +; AVX512-NEXT: movq %r12, %rbp +; AVX512-NEXT: shldq %cl, %r8, %rbp +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: orq %rbp, %rdi +; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi +; AVX512-NEXT: shldq %cl, %r12, %rax ; AVX512-NEXT: orq %rax, %r15 ; AVX512-NEXT: shlxq %rcx, %rsi, %rax ; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r9 -; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: andnq (%rdi), %rbx, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: andl $60, %r8d -; AVX512-NEXT: movl (%rdi,%r8), %eax -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; AVX512-NEXT: btl %r8d, %eax -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r12, 48(%rdi) -; AVX512-NEXT: movq %r14, 40(%rdi) -; AVX512-NEXT: movq %rdx, 32(%rdi) -; AVX512-NEXT: movq %r11, 24(%rdi) -; AVX512-NEXT: movq %r15, 16(%rdi) -; AVX512-NEXT: movq %rcx, 8(%rdi) -; AVX512-NEXT: movq %rsi, (%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $152, %rsp +; AVX512-NEXT: shldq %cl, %rsi, %r8 +; AVX512-NEXT: orq %rax, %r14 +; AVX512-NEXT: orq %r8, %rbx +; AVX512-NEXT: orq %rdx, %r13 +; AVX512-NEXT: movq %r11, 48(%r9) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 56(%r9) +; AVX512-NEXT: movq %r10, 32(%r9) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 40(%r9) +; AVX512-NEXT: movq %rdi, 16(%r9) +; AVX512-NEXT: movq %r15, 24(%r9) +; AVX512-NEXT: movq %r14, (%r9) +; AVX512-NEXT: movq %rbx, 8(%r9) +; AVX512-NEXT: sete %al +; AVX512-NEXT: addq $184, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 @@ -1649,25 +4274,2749 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i4096: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $4064, %edx # imm = 0xFE0 -; X86-NEXT: shrl $3, %edx -; X86-NEXT: movl (%eax,%edx), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: setb %al +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $1792, %esp # imm = 0x700 +; X86-NEXT: movl 12(%ebp), %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $508, %ecx # imm = 0x1FC +; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 248(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 252(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 504(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 508(%esi), %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 120(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 124(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 376(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 380(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 184(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 188(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 440(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 444(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 60(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 312(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 316(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 216(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 220(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 472(%esi), %edi +; X86-NEXT: movl 476(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 88(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 344(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 348(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 152(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 156(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 408(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 412(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 280(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 284(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 232(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 236(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 488(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 492(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 104(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 108(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 360(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 364(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 168(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 172(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 424(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 428(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 296(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 300(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 200(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 204(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 456(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 460(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 72(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 76(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 328(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 332(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 136(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 140(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 392(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 396(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 264(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 268(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 240(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 244(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 496(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 500(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 112(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 116(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 368(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 372(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 176(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 180(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 432(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 436(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 304(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 308(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 208(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 212(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 464(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 468(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 84(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 336(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 340(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 144(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 148(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 400(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 404(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 272(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 276(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 224(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 228(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 480(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 484(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 100(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 352(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 356(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 160(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 164(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 416(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 420(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 288(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 292(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 192(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 196(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 448(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 452(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 64(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 68(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 320(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 324(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 132(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl 256(%esi), %edi +; X86-NEXT: movl 260(%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 388(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 4(%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $1, %eax, %edi +; X86-NEXT: shrl %eax +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: notb %cl +; X86-NEXT: shrdl %cl, %eax, %edi +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: movb $32, %cl +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: jne .LBB20_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: .LBB20_2: +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shll %cl, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 320(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 64(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 448(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 192(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 288(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 32(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 416(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 160(%eax), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 352(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 96(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 480(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 224(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 272(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 16(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 400(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 144(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 336(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 80(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 464(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 208(%eax), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 304(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 48(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 432(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 176(%eax), %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 368(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 112(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 496(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: andl 240(%eax), %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 264(%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 8(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 392(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 136(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 328(%ebx), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 72(%ebx), %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 456(%ebx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 200(%ebx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 296(%ebx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 40(%ebx), %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 424(%ebx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 168(%ebx), %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 360(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 104(%ebx), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 488(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 232(%ebx), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 280(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 24(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 408(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 152(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 344(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 88(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 472(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 216(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 312(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 56(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 440(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 184(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 376(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 120(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 504(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 248(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 324(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 68(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 452(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 196(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 292(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 36(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 420(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 164(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 356(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 100(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 484(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 228(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 276(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 20(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 404(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 148(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 340(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 84(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 468(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 212(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 308(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 52(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 436(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 180(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 372(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 116(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 500(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 244(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 268(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 12(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 396(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 140(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 332(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 76(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 460(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 204(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 300(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 44(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 428(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 172(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 364(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 108(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 492(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl 236(%ebx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 284(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 28(%ebx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 412(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 156(%ebx), %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 348(%ebx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 92(%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 476(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 220(%ebx), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 316(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 60(%ebx), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 444(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 188(%ebx), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 380(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl 124(%ebx), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 508(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: andl 252(%esi), %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: negl %ecx +; X86-NEXT: movl 1648(%esp,%ecx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 128(%edx), %ecx +; X86-NEXT: andl 384(%edx), %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edx), %eax +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 256(%edx), %eax +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 260(%edx), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: andl 4(%edx), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl 132(%edx), %eax +; X86-NEXT: andl 388(%edx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: setne %al +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; X64-LABEL: test_ne_i4096: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl $4064, %eax # imm = 0xFE0 -; X64-NEXT: shrl $3, %eax -; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: btl %esi, %eax -; X64-NEXT: setb %al -; X64-NEXT: retq +; SSE-LABEL: test_ne_i4096: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $1576, %rsp # imm = 0x628 +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: andl $4032, %eax # imm = 0xFC0 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: movslq %eax, %rsi +; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1304(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1560(%rsp,%rsi), %rax +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1176(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1432(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1240(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1496(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1112(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; SSE-NEXT: movq 1368(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1272(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1528(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1144(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1400(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1208(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1464(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1080(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1336(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1288(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1544(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1160(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1416(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1216(%rsp,%rsi), %r11 +; SSE-NEXT: movq 1224(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %r11, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1480(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1088(%rsp,%rsi), %r9 +; SSE-NEXT: movq 1096(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %r9, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1352(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1248(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1512(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1120(%rsp,%rsi), %rax +; SSE-NEXT: movq 1128(%rsp,%rsi), %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: shldq %cl, %rax, %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1376(%rsp,%rsi), %r13 +; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx +; SSE-NEXT: movq %rbx, %r8 +; SSE-NEXT: shldq %cl, %r13, %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1192(%rsp,%rsi), %r15 +; SSE-NEXT: movq %r15, %r14 +; SSE-NEXT: shldq %cl, %rdx, %r14 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1440(%rsp,%rsi), %r10 +; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx +; SSE-NEXT: movq %rdx, %r14 +; SSE-NEXT: shldq %cl, %r10, %r14 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1312(%rsp,%rsi), %r14 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp +; SSE-NEXT: movq %rbp, %r12 +; SSE-NEXT: shldq %cl, %r14, %r12 +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx +; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE-NEXT: shldq %cl, %rbp, %r14 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rdx, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %rbp +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r15, %r13 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r12, %r15 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE-NEXT: shldq %cl, %r12, %r10 +; SSE-NEXT: andq 384(%rdi), %r10 +; SSE-NEXT: andq 128(%rdi), %r15 +; SSE-NEXT: andq 320(%rdi), %r13 +; SSE-NEXT: andq 64(%rdi), %rax +; SSE-NEXT: orq %r10, %r15 +; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: andq 448(%rdi), %r9 +; SSE-NEXT: andq 192(%rdi), %rbp +; SSE-NEXT: orq %r9, %rbp +; SSE-NEXT: orq %rax, %rbp +; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: andq 288(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 32(%rdi), %r9 +; SSE-NEXT: andq 416(%rdi), %rdx +; SSE-NEXT: andq 160(%rdi), %r11 +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: orq %rdx, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 352(%rdi), %rdx +; SSE-NEXT: orq %r9, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 96(%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 480(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 224(%rdi), %r8 +; SSE-NEXT: orq %rax, %r8 +; SSE-NEXT: orq %rdx, %r8 +; SSE-NEXT: andq 272(%rdi), %r14 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 16(%rdi), %rax +; SSE-NEXT: orq %r14, %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 400(%rdi), %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 144(%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 336(%rdi), %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 80(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 464(%rdi), %rdx +; SSE-NEXT: orq %r9, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 208(%rdi), %r11 +; SSE-NEXT: orq %rdx, %r11 +; SSE-NEXT: orq %rax, %r11 +; SSE-NEXT: orq %r8, %r11 +; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload +; SSE-NEXT: andq 304(%rdi), %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 48(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 432(%rdi), %r9 +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 176(%rdi), %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 368(%rdi), %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 112(%rdi), %rax +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: orq %r9, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 496(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE-NEXT: andq 240(%rdi), %rbp +; SSE-NEXT: orq %r8, %rbp +; SSE-NEXT: orq %rax, %rbp +; SSE-NEXT: orq %r10, %rbp +; SSE-NEXT: orq %r11, %rbp +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 392(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE-NEXT: andq 136(%rdi), %r12 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 328(%rdi), %rdx +; SSE-NEXT: orq %rax, %r12 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 72(%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 456(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; SSE-NEXT: andq 200(%rdi), %r13 +; SSE-NEXT: orq %rax, %r13 +; SSE-NEXT: orq %rdx, %r13 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 296(%rdi), %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 40(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 424(%rdi), %r8 +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: andq 168(%rdi), %rdx +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 360(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 104(%rdi), %rax +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 488(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE-NEXT: andq 232(%rdi), %r15 +; SSE-NEXT: orq %rax, %r15 +; SSE-NEXT: orq %r8, %r15 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 280(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 24(%rdi), %rax +; SSE-NEXT: orq %rdx, %r15 +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 408(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 152(%rdi), %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: orq %r10, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 344(%rdi), %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 88(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 472(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE-NEXT: andq 216(%rdi), %r14 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: orq %rax, %r14 +; SSE-NEXT: orq %r8, %r14 +; SSE-NEXT: orq %r10, %r14 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 312(%rdi), %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT: andq 56(%rdi), %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 440(%rdi), %r8 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; SSE-NEXT: andq 184(%rdi), %r9 +; SSE-NEXT: orq %r11, %r10 +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT: andq 376(%rdi), %r10 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andq 120(%rdi), %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: andq 504(%rdi), %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: andq 248(%rdi), %r8 +; SSE-NEXT: orq %r10, %rax +; SSE-NEXT: movq %rax, %r10 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: movq 1056(%rsp,%rsi), %rax +; SSE-NEXT: shldq %cl, %rax, %rbx +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: andq 256(%rdi), %rdx +; SSE-NEXT: orq %r14, %r8 +; SSE-NEXT: andq (%rdi), %rax +; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT: orq %rbp, %rax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: andq 264(%rdi), %rcx +; SSE-NEXT: andq 8(%rdi), %rbx +; SSE-NEXT: orq %rcx, %rbx +; SSE-NEXT: orq %r12, %rbx +; SSE-NEXT: orq %r13, %rbx +; SSE-NEXT: orq %r15, %rbx +; SSE-NEXT: orq %r8, %rbx +; SSE-NEXT: orq %rax, %rbx +; SSE-NEXT: setne %al +; SSE-NEXT: addq $1576, %rsp # imm = 0x628 +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ne_i4096: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0 +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movslq %eax, %rsi +; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11 +; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %r11, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12 +; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %r12, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rdx, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rax, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp +; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rbp, %rax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax +; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: shldq %cl, %rax, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10 +; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8 +; AVX2-NEXT: movq %r8, %rdx +; AVX2-NEXT: shldq %cl, %r10, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx +; AVX2-NEXT: movq %rbx, %rdx +; AVX2-NEXT: shldq %cl, %r9, %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9 +; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: shldq %cl, %r9, %r14 +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15 +; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14 +; AVX2-NEXT: movq %r14, %r13 +; AVX2-NEXT: shldq %cl, %r15, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx +; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %rbp +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r14, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r15, %r9 +; AVX2-NEXT: andq 384(%rdi), %r9 +; AVX2-NEXT: andq 128(%rdi), %r14 +; AVX2-NEXT: andq 320(%rdi), %r10 +; AVX2-NEXT: orq %r9, %r14 +; AVX2-NEXT: movq %r14, %r15 +; AVX2-NEXT: andq 64(%rdi), %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: andq 448(%rdi), %rbp +; AVX2-NEXT: andq 192(%rdi), %r13 +; AVX2-NEXT: orq %rbp, %r13 +; AVX2-NEXT: orq %rax, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: andq 288(%rdi), %r8 +; AVX2-NEXT: andq 32(%rdi), %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 416(%rdi), %rax +; AVX2-NEXT: orq %r8, %r12 +; AVX2-NEXT: andq 160(%rdi), %r11 +; AVX2-NEXT: orq %rax, %r11 +; AVX2-NEXT: andq 352(%rdi), %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 96(%rdi), %rax +; AVX2-NEXT: orq %r12, %r11 +; AVX2-NEXT: orq %rbx, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 480(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX2-NEXT: andq 224(%rdi), %r13 +; AVX2-NEXT: orq %r10, %r13 +; AVX2-NEXT: orq %rax, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 272(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 16(%rdi), %rax +; AVX2-NEXT: orq %r11, %r13 +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: andq 400(%rdi), %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 144(%rdi), %rax +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 336(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 80(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 464(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: andq 208(%rdi), %r11 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: orq %r8, %r11 +; AVX2-NEXT: orq %rax, %r11 +; AVX2-NEXT: orq %r9, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: andq 304(%rdi), %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 48(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 432(%rdi), %r10 +; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX2-NEXT: andq 176(%rdi), %rax +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: movq %r8, %r9 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 368(%rdi), %r8 +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 112(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 496(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX2-NEXT: andq 240(%rdi), %r9 +; AVX2-NEXT: orq %r8, %r9 +; AVX2-NEXT: orq %rax, %r9 +; AVX2-NEXT: orq %r10, %r9 +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 392(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX2-NEXT: andq 136(%rdi), %rbp +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 328(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 72(%rdi), %rax +; AVX2-NEXT: orq %r10, %rbp +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 456(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: andq 200(%rdi), %r12 +; AVX2-NEXT: orq %rax, %r12 +; AVX2-NEXT: orq %r8, %r12 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 296(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 40(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: andq 424(%rdi), %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 168(%rdi), %rax +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, %r10 +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 360(%rdi), %r8 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 104(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 488(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX2-NEXT: andq 232(%rdi), %r14 +; AVX2-NEXT: orq %rax, %r14 +; AVX2-NEXT: orq %r8, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 280(%rdi), %r8 +; AVX2-NEXT: orq %r10, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 24(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 408(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 152(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX2-NEXT: andq 344(%rdi), %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 88(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 472(%rdi), %rax +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: andq 216(%rdi), %rbx +; AVX2-NEXT: orq %rax, %rbx +; AVX2-NEXT: orq %r8, %rbx +; AVX2-NEXT: orq %r10, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 312(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 56(%rdi), %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 440(%rdi), %r10 +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 184(%rdi), %r8 +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: andq 376(%rdi), %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 120(%rdi), %rax +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: movq %r8, %r11 +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: andq 504(%rdi), %r8 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: andq 248(%rdi), %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: shldq %cl, %r8, %r10 +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi +; AVX2-NEXT: orq %rbx, %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: shlxq %rcx, %rsi, %rax +; AVX2-NEXT: andq 256(%rdi), %r10 +; AVX2-NEXT: andq (%rdi), %rax +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: orq %r15, %rax +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT: orq %r13, %rax +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rsi, %rdx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: andq 264(%rdi), %rcx +; AVX2-NEXT: andq 8(%rdi), %rdx +; AVX2-NEXT: orq %r9, %rax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: orq %rbp, %rdx +; AVX2-NEXT: orq %r12, %rdx +; AVX2-NEXT: orq %r14, %rdx +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: setne %al +; AVX2-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ne_i4096: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl %esi, %eax +; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $63, %ecx +; AVX512-NEXT: shrl $3, %eax +; AVX512-NEXT: negl %eax +; AVX512-NEXT: movslq %eax, %rsi +; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10 +; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %r10, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14 +; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rdx, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12 +; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %r12, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax +; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11 +; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx +; AVX512-NEXT: movq %rbx, %rdx +; AVX512-NEXT: shldq %cl, %r11, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9 +; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8 +; AVX512-NEXT: movq %r8, %rdx +; AVX512-NEXT: shldq %cl, %r9, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9 +; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: shldq %cl, %r9, %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15 +; AVX512-NEXT: movq %r15, %r13 +; AVX512-NEXT: shldq %cl, %rbp, %r13 +; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx +; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r13 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %r15, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbp, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rbp, %r9 +; AVX512-NEXT: andq 384(%rdi), %r9 +; AVX512-NEXT: andq 128(%rdi), %r15 +; AVX512-NEXT: orq %r9, %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: andq 320(%rdi), %r11 +; AVX512-NEXT: andq 64(%rdi), %rax +; AVX512-NEXT: orq %r11, %rax +; AVX512-NEXT: andq 448(%rdi), %r12 +; AVX512-NEXT: andq 192(%rdi), %r13 +; AVX512-NEXT: orq %r12, %r13 +; AVX512-NEXT: orq %rax, %r13 +; AVX512-NEXT: andq 288(%rdi), %r8 +; AVX512-NEXT: andq 32(%rdi), %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 416(%rdi), %rax +; AVX512-NEXT: orq %r8, %r14 +; AVX512-NEXT: andq 160(%rdi), %r10 +; AVX512-NEXT: orq %rax, %r10 +; AVX512-NEXT: andq 352(%rdi), %rbx +; AVX512-NEXT: orq %r14, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 96(%rdi), %rax +; AVX512-NEXT: orq %rbx, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 480(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: andq 224(%rdi), %r15 +; AVX512-NEXT: orq %rax, %r15 +; AVX512-NEXT: orq %r8, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 272(%rdi), %r8 +; AVX512-NEXT: orq %r10, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 16(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 400(%rdi), %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 144(%rdi), %rax +; AVX512-NEXT: orq %r9, %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 336(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 80(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 464(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: andq 208(%rdi), %r11 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: orq %r8, %r11 +; AVX512-NEXT: orq %rax, %r11 +; AVX512-NEXT: orq %r9, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 304(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 48(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 432(%rdi), %r9 +; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload +; AVX512-NEXT: andq 176(%rdi), %r8 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 368(%rdi), %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 112(%rdi), %rax +; AVX512-NEXT: orq %r10, %r8 +; AVX512-NEXT: movq %r8, %r10 +; AVX512-NEXT: orq %r9, %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 496(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: andq 240(%rdi), %r9 +; AVX512-NEXT: orq %r8, %r9 +; AVX512-NEXT: orq %rax, %r9 +; AVX512-NEXT: orq %r10, %r9 +; AVX512-NEXT: orq %r11, %r9 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 392(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512-NEXT: andq 136(%rdi), %rbp +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 328(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 72(%rdi), %rax +; AVX512-NEXT: orq %r10, %rbp +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 456(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX512-NEXT: andq 200(%rdi), %r12 +; AVX512-NEXT: orq %rax, %r12 +; AVX512-NEXT: orq %r8, %r12 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 296(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 40(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 424(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 168(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 360(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 104(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 488(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512-NEXT: andq 232(%rdi), %r14 +; AVX512-NEXT: orq %rax, %r14 +; AVX512-NEXT: orq %r8, %r14 +; AVX512-NEXT: orq %r10, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 280(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 24(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 408(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 152(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: andq 344(%rdi), %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 88(%rdi), %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 472(%rdi), %rax +; AVX512-NEXT: orq %r11, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX512-NEXT: andq 216(%rdi), %rbx +; AVX512-NEXT: orq %rax, %rbx +; AVX512-NEXT: orq %r8, %rbx +; AVX512-NEXT: orq %r10, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: andq 312(%rdi), %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 56(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 440(%rdi), %r8 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 184(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 376(%rdi), %r8 +; AVX512-NEXT: orq %r10, %rax +; AVX512-NEXT: movq %rax, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 120(%rdi), %rax +; AVX512-NEXT: orq %r8, %rax +; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 504(%rdi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: andq 248(%rdi), %r8 +; AVX512-NEXT: orq %rax, %r8 +; AVX512-NEXT: orq %r10, %r8 +; AVX512-NEXT: orq %r11, %r8 +; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: shldq %cl, %rsi, %r10 +; AVX512-NEXT: orq %rbx, %r8 +; AVX512-NEXT: shlxq %rcx, %rax, %rsi +; AVX512-NEXT: andq 256(%rdi), %r10 +; AVX512-NEXT: andq (%rdi), %rsi +; AVX512-NEXT: orq %r10, %rsi +; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: orq %r13, %rsi +; AVX512-NEXT: orq %r15, %rsi +; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512-NEXT: shldq %cl, %rax, %rdx +; AVX512-NEXT: orq %r9, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: andq 264(%rdi), %rax +; AVX512-NEXT: andq 8(%rdi), %rdx +; AVX512-NEXT: orq %rax, %rdx +; AVX512-NEXT: orq %rbp, %rdx +; AVX512-NEXT: orq %r12, %rdx +; AVX512-NEXT: orq %r14, %rdx +; AVX512-NEXT: orq %r8, %rdx +; AVX512-NEXT: orq %rsi, %rdx +; AVX512-NEXT: setne %al +; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs @@ -1812,8 +7161,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzbl 12(%ebp), %ecx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1826,41 +7175,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 36(%esp,%edi), %edx -; X86-NEXT: movl 40(%esp,%edi), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 32(%esp,%edi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%edi), %edi -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl (%ecx,%eax), %eax -; X86-NEXT: andl %ebx, (%ecx) -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %esi +; X86-NEXT: movl 60(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl 8(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %esi, %ecx +; X86-NEXT: movl (%ebx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %edi, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl 12(%ebx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl 4(%ebx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: notl %ecx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: notl %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: notl %edx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl %edx, 4(%ebx) -; X86-NEXT: notl %esi -; X86-NEXT: andl %esi, 8(%ebx) +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: notl %edi -; X86-NEXT: andl %edi, 12(%ebx) -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: jae .LBB22_2 +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl %ebx, 8(%esi) +; X86-NEXT: movl %ecx, 12(%esi) +; X86-NEXT: movl %edi, (%esi) +; X86-NEXT: movl %edx, 4(%esi) +; X86-NEXT: je .LBB22_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB22_2: @@ -1882,75 +7242,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; SSE-NEXT: testb $64, %cl ; SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %rax, %rsi +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r9 +; SSE-NEXT: movq %r9, %r10 +; SSE-NEXT: andq %r8, %r10 ; SSE-NEXT: notq %r8 +; SSE-NEXT: movq %rcx, %r11 +; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: movl %ecx, %r9d -; SSE-NEXT: andl $96, %r9d -; SSE-NEXT: shrl $3, %r9d -; SSE-NEXT: movl (%rdi,%r9), %r9d -; SSE-NEXT: btl %ecx, %r9d -; SSE-NEXT: jb .LBB22_2 +; SSE-NEXT: andq %r9, %r8 +; SSE-NEXT: andq %rcx, %rsi +; SSE-NEXT: orq %r10, %r11 +; SSE-NEXT: jne .LBB22_2 ; SSE-NEXT: # %bb.1: ; SSE-NEXT: movl (%rdx), %eax ; SSE-NEXT: .LBB22_2: -; SSE-NEXT: andq %r8, 8(%rdi) -; SSE-NEXT: andq %rsi, (%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX2-LABEL: reset_multiload_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %r8d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %r8, %r8 -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: cmovneq %rax, %r8 -; AVX2-NEXT: notq %rsi -; AVX2-NEXT: notq %r8 -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: andl $96, %r9d -; AVX2-NEXT: shrl $3, %r9d -; AVX2-NEXT: movl (%rdi,%r9), %r9d -; AVX2-NEXT: btl %ecx, %r9d -; AVX2-NEXT: jb .LBB22_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl (%rdx), %eax -; AVX2-NEXT: .LBB22_2: -; AVX2-NEXT: andq %rsi, 8(%rdi) -; AVX2-NEXT: andq %r8, (%rdi) -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_multiload_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %r8d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %r8, %rsi -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: shlxq %rcx, %r8, %r8 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %r8, %rsi -; AVX512-NEXT: cmovneq %rax, %r8 -; AVX512-NEXT: notq %rsi -; AVX512-NEXT: notq %r8 -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: andl $96, %r9d -; AVX512-NEXT: shrl $3, %r9d -; AVX512-NEXT: movl (%rdi,%r9), %r9d -; AVX512-NEXT: btl %ecx, %r9d -; AVX512-NEXT: jb .LBB22_2 -; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl (%rdx), %eax -; AVX512-NEXT: .LBB22_2: -; AVX512-NEXT: andq %rsi, 8(%rdi) -; AVX512-NEXT: andq %r8, (%rdi) -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX-LABEL: reset_multiload_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: movl $1, %esi +; AVX-NEXT: xorl %r8d, %r8d +; AVX-NEXT: shldq %cl, %rsi, %r8 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: shlxq %rcx, %rsi, %r9 +; AVX-NEXT: testb $64, %cl +; AVX-NEXT: cmovneq %r9, %r8 +; AVX-NEXT: cmovneq %rax, %r9 +; AVX-NEXT: movq (%rdi), %r10 +; AVX-NEXT: movq 8(%rdi), %r11 +; AVX-NEXT: andnq %r11, %r8, %rcx +; AVX-NEXT: andq %r8, %r11 +; AVX-NEXT: andnq %r10, %r9, %rsi +; AVX-NEXT: andq %r9, %r10 +; AVX-NEXT: orq %r11, %r10 +; AVX-NEXT: jne .LBB22_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: movl (%rdx), %eax +; AVX-NEXT: .LBB22_2: +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq %rcx, 8(%rdi) +; AVX-NEXT: # kill: def $eax killed $eax killed $rax +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index e73ff79..f270f8f 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void diff --git a/llvm/test/CodeGen/X86/pr165755.ll b/llvm/test/CodeGen/X86/pr165755.ll new file mode 100644 index 0000000..3ab484f --- /dev/null +++ b/llvm/test/CodeGen/X86/pr165755.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64 + +define i32 @PR165755(ptr %p0) { +; X86-LABEL: PR165755: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %eax +; X86-NEXT: movb $0, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: PR165755: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movb $0, (%rdi) +; X64-NEXT: retq + %ld64 = load i64, ptr %p0, align 8 + store i8 0, ptr %p0, align 1 + %ld32 = load i32, ptr %p0, align 8 + %mask = and i32 %ld32, 32 + %zext = zext i32 %mask to i64 + %srl = lshr i64 %ld64, %zext + %res = trunc i64 %srl to i32 + ret i32 %res +} |
