Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--  llvm/test/CodeGen/X86/and-mask-variable.ll                               |   12
-rw-r--r--  llvm/test/CodeGen/X86/atomic-load-store.ll                               |  504
-rw-r--r--  llvm/test/CodeGen/X86/bfloat-calling-conv.ll                             |    6
-rw-r--r--  llvm/test/CodeGen/X86/trunc-srl-load.ll                                  | 1652
-rw-r--r--  llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll                            |   69
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll |   50
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll                      |   53
7 files changed, 312 insertions, 2034 deletions
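
Most of the 2034 deleted lines come from trunc-srl-load.ll, where a truncated variable-amount right shift of a wide load is now selected as a narrow load at a computed byte offset; with the per-ISA differences gone, the SSE/AVX2/AVX512 check prefixes collapse into a single X64 prefix. A minimal sketch of the pattern those tests exercise (the first three IR lines appear verbatim in the diff below; the load/shift/trunc tail is a reconstruction and the function name is illustrative):

define i16 @sketch_extract64_16(ptr %word, i32 %idx) nounwind {
  %idx_bounds = and i32 %idx, 63         ; bit index stays inside the i64
  %idx_align = and i32 %idx_bounds, -16  ; align it to an i16 element
  %sh = zext nneg i32 %idx_align to i64
  %ld = load i64, ptr %word, align 8     ; wide load ...
  %sr = lshr i64 %ld, %sh                ; ... shifted by a non-constant amount
  %res = trunc i64 %sr to i16            ; only 16 bits survive, so this can be
  ret i16 %res                           ; selected as movzwl (%rdi,%rsi), %eax
}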
diff --git a/llvm/test/CodeGen/X86/and-mask-variable.ll b/llvm/test/CodeGen/X86/and-mask-variable.ll
index d89f0db..3e5bd69 100644
--- a/llvm/test/CodeGen/X86/and-mask-variable.ll
+++ b/llvm/test/CodeGen/X86/and-mask-variable.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-NOBMI
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-NOBMI
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X86-NOBMI
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2
define i32 @mask_pair(i32 %x, i32 %y) nounwind {
; X86-NOBMI-LABEL: mask_pair:
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 3e7b73a..1173c45 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0
define void @test1(ptr %ptr, i32 %val1) {
; CHECK-LABEL: test1:
@@ -50,30 +50,10 @@ define <1 x i8> @atomic_vec1_i8(ptr %x) {
; CHECK-O3-NEXT: movzbl (%rdi), %eax
; CHECK-O3-NEXT: retq
;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i8:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i8:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: retq
-;
; CHECK-O0-LABEL: atomic_vec1_i8:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movb (%rdi), %al
; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i8:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movb (%rdi), %al
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i8:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movb (%rdi), %al
-; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x i8>, ptr %x acquire, align 1
ret <1 x i8> %ret
}
@@ -84,30 +64,10 @@ define <1 x i16> @atomic_vec1_i16(ptr %x) {
; CHECK-O3-NEXT: movzwl (%rdi), %eax
; CHECK-O3-NEXT: retq
;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i16:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i16:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: retq
-;
; CHECK-O0-LABEL: atomic_vec1_i16:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movw (%rdi), %ax
; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i16:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movw (%rdi), %ax
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i16:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movw (%rdi), %ax
-; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x i16>, ptr %x acquire, align 2
ret <1 x i16> %ret
}
@@ -119,35 +79,11 @@ define <1 x i32> @atomic_vec1_i8_zext(ptr %x) {
; CHECK-O3-NEXT: movzbl %al, %eax
; CHECK-O3-NEXT: retq
;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i8_zext:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movzbl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: movzbl %al, %eax
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i8_zext:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movzbl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: movzbl %al, %eax
-; CHECK-AVX-O3-NEXT: retq
-;
; CHECK-O0-LABEL: atomic_vec1_i8_zext:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movb (%rdi), %al
; CHECK-O0-NEXT: movzbl %al, %eax
; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i8_zext:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movb (%rdi), %al
-; CHECK-SSE-O0-NEXT: movzbl %al, %eax
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i8_zext:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movb (%rdi), %al
-; CHECK-AVX-O0-NEXT: movzbl %al, %eax
-; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x i8>, ptr %x acquire, align 1
%zret = zext <1 x i8> %ret to <1 x i32>
ret <1 x i32> %zret
@@ -160,35 +96,11 @@ define <1 x i64> @atomic_vec1_i16_sext(ptr %x) {
; CHECK-O3-NEXT: movswq %ax, %rax
; CHECK-O3-NEXT: retq
;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i16_sext:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT: movswq %ax, %rax
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i16_sext:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT: movswq %ax, %rax
-; CHECK-AVX-O3-NEXT: retq
-;
; CHECK-O0-LABEL: atomic_vec1_i16_sext:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movw (%rdi), %ax
; CHECK-O0-NEXT: movswq %ax, %rax
; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i16_sext:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: movw (%rdi), %ax
-; CHECK-SSE-O0-NEXT: movswq %ax, %rax
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i16_sext:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: movw (%rdi), %ax
-; CHECK-AVX-O0-NEXT: movswq %ax, %rax
-; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x i16>, ptr %x acquire, align 2
%sret = sext <1 x i16> %ret to <1 x i64>
ret <1 x i64> %sret
@@ -204,12 +116,6 @@ define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) {
}
define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_bfloat:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
@@ -222,15 +128,6 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec1_bfloat:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movw (%rdi), %cx
-; CHECK-O0-NEXT: # implicit-def: $eax
-; CHECK-O0-NEXT: movw %cx, %ax
-; CHECK-O0-NEXT: # implicit-def: $xmm0
-; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
@@ -283,30 +180,6 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK-O3-NEXT: popq %rcx
; CHECK-O3-NEXT: retq
;
-; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: pushq %rax
-; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
-; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
-; CHECK-SSE-O3-NEXT: movl $8, %edi
-; CHECK-SSE-O3-NEXT: movl $2, %ecx
-; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
-; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
-; CHECK-SSE-O3-NEXT: popq %rcx
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: pushq %rax
-; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
-; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
-; CHECK-AVX-O3-NEXT: movl $8, %edi
-; CHECK-AVX-O3-NEXT: movl $2, %ecx
-; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
-; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
-; CHECK-AVX-O3-NEXT: popq %rcx
-; CHECK-AVX-O3-NEXT: retq
-;
; CHECK-O0-LABEL: atomic_vec1_ptr:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: pushq %rax
@@ -318,41 +191,11 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
; CHECK-O0-NEXT: movq (%rsp), %rax
; CHECK-O0-NEXT: popq %rcx
; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: pushq %rax
-; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
-; CHECK-SSE-O0-NEXT: movl $8, %edi
-; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
-; CHECK-SSE-O0-NEXT: movl $2, %ecx
-; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
-; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
-; CHECK-SSE-O0-NEXT: popq %rcx
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: pushq %rax
-; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
-; CHECK-AVX-O0-NEXT: movl $8, %edi
-; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
-; CHECK-AVX-O0-NEXT: movl $2, %ecx
-; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
-; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
-; CHECK-AVX-O0-NEXT: popq %rcx
-; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x ptr>, ptr %x acquire, align 4
ret <1 x ptr> %ret
}
define <1 x half> @atomic_vec1_half(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_half:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movzwl (%rdi), %eax
-; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec1_half:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
@@ -365,15 +208,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec1_half:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movw (%rdi), %cx
-; CHECK-O0-NEXT: # implicit-def: $eax
-; CHECK-O0-NEXT: movw %cx, %ax
-; CHECK-O0-NEXT: # implicit-def: $xmm0
-; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec1_half:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
@@ -396,11 +230,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
}
define <1 x float> @atomic_vec1_float(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_float:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec1_float:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -411,11 +240,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) {
; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec1_float:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec1_float:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -430,11 +254,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) {
}
define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec1_double_align:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
@@ -445,11 +264,6 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec1_double_align:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
@@ -476,30 +290,6 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
; CHECK-O3-NEXT: popq %rcx
; CHECK-O3-NEXT: retq
;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
-; CHECK-SSE-O3: # %bb.0:
-; CHECK-SSE-O3-NEXT: pushq %rax
-; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
-; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
-; CHECK-SSE-O3-NEXT: movl $8, %edi
-; CHECK-SSE-O3-NEXT: movl $2, %ecx
-; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
-; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
-; CHECK-SSE-O3-NEXT: popq %rcx
-; CHECK-SSE-O3-NEXT: retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
-; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: pushq %rax
-; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
-; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
-; CHECK-AVX-O3-NEXT: movl $8, %edi
-; CHECK-AVX-O3-NEXT: movl $2, %ecx
-; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
-; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
-; CHECK-AVX-O3-NEXT: popq %rcx
-; CHECK-AVX-O3-NEXT: retq
-;
; CHECK-O0-LABEL: atomic_vec1_i64:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: pushq %rax
@@ -511,47 +301,11 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
; CHECK-O0-NEXT: movq (%rsp), %rax
; CHECK-O0-NEXT: popq %rcx
; CHECK-O0-NEXT: retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
-; CHECK-SSE-O0: # %bb.0:
-; CHECK-SSE-O0-NEXT: pushq %rax
-; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
-; CHECK-SSE-O0-NEXT: movl $8, %edi
-; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
-; CHECK-SSE-O0-NEXT: movl $2, %ecx
-; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
-; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
-; CHECK-SSE-O0-NEXT: popq %rcx
-; CHECK-SSE-O0-NEXT: retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
-; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: pushq %rax
-; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
-; CHECK-AVX-O0-NEXT: movl $8, %edi
-; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
-; CHECK-AVX-O0-NEXT: movl $2, %ecx
-; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
-; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
-; CHECK-AVX-O0-NEXT: popq %rcx
-; CHECK-AVX-O0-NEXT: retq
%ret = load atomic <1 x i64>, ptr %x acquire, align 4
ret <1 x i64> %ret
}
define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec1_double:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rax
-; CHECK-O3-NEXT: movq %rdi, %rsi
-; CHECK-O3-NEXT: movq %rsp, %rdx
-; CHECK-O3-NEXT: movl $8, %edi
-; CHECK-O3-NEXT: movl $2, %ecx
-; CHECK-O3-NEXT: callq __atomic_load@PLT
-; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT: popq %rax
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec1_double:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: pushq %rax
@@ -576,18 +330,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
; CHECK-AVX-O3-NEXT: popq %rax
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec1_double:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rax
-; CHECK-O0-NEXT: movq %rdi, %rsi
-; CHECK-O0-NEXT: movl $8, %edi
-; CHECK-O0-NEXT: movq %rsp, %rdx
-; CHECK-O0-NEXT: movl $2, %ecx
-; CHECK-O0-NEXT: callq __atomic_load@PLT
-; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT: popq %rax
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec1_double:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: pushq %rax
@@ -616,18 +358,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
}
define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec2_i32:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rax
-; CHECK-O3-NEXT: movq %rdi, %rsi
-; CHECK-O3-NEXT: movq %rsp, %rdx
-; CHECK-O3-NEXT: movl $8, %edi
-; CHECK-O3-NEXT: movl $2, %ecx
-; CHECK-O3-NEXT: callq __atomic_load@PLT
-; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT: popq %rax
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: pushq %rax
@@ -652,18 +382,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
; CHECK-AVX-O3-NEXT: popq %rax
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec2_i32:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rax
-; CHECK-O0-NEXT: movq %rdi, %rsi
-; CHECK-O0-NEXT: movl $8, %edi
-; CHECK-O0-NEXT: movq %rsp, %rdx
-; CHECK-O0-NEXT: movl $2, %ecx
-; CHECK-O0-NEXT: callq __atomic_load@PLT
-; CHECK-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT: popq %rax
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: pushq %rax
@@ -692,18 +410,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
}
define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec4_float:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: subq $24, %rsp
-; CHECK-O3-NEXT: movq %rdi, %rsi
-; CHECK-O3-NEXT: movq %rsp, %rdx
-; CHECK-O3-NEXT: movl $16, %edi
-; CHECK-O3-NEXT: movl $2, %ecx
-; CHECK-O3-NEXT: callq __atomic_load@PLT
-; CHECK-O3-NEXT: movaps (%rsp), %xmm0
-; CHECK-O3-NEXT: addq $24, %rsp
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec4_float:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: subq $24, %rsp
@@ -728,18 +434,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
; CHECK-AVX-O3-NEXT: addq $24, %rsp
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec4_float:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: subq $24, %rsp
-; CHECK-O0-NEXT: movq %rdi, %rsi
-; CHECK-O0-NEXT: movl $16, %edi
-; CHECK-O0-NEXT: movq %rsp, %rdx
-; CHECK-O0-NEXT: movl $2, %ecx
-; CHECK-O0-NEXT: callq __atomic_load@PLT
-; CHECK-O0-NEXT: movaps (%rsp), %xmm0
-; CHECK-O0-NEXT: addq $24, %rsp
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec4_float:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: subq $24, %rsp
@@ -768,21 +462,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
}
define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec8_double:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: subq $72, %rsp
-; CHECK-O3-NEXT: movq %rdi, %rsi
-; CHECK-O3-NEXT: movq %rsp, %rdx
-; CHECK-O3-NEXT: movl $64, %edi
-; CHECK-O3-NEXT: movl $2, %ecx
-; CHECK-O3-NEXT: callq __atomic_load@PLT
-; CHECK-O3-NEXT: movaps (%rsp), %xmm0
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O3-NEXT: addq $72, %rsp
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec8_double:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: subq $72, %rsp
@@ -798,20 +477,30 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
; CHECK-SSE-O3-NEXT: addq $72, %rsp
; CHECK-SSE-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec8_double:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: subq $72, %rsp
-; CHECK-O0-NEXT: movq %rdi, %rsi
-; CHECK-O0-NEXT: movl $64, %edi
-; CHECK-O0-NEXT: movq %rsp, %rdx
-; CHECK-O0-NEXT: movl $2, %ecx
-; CHECK-O0-NEXT: callq __atomic_load@PLT
-; CHECK-O0-NEXT: movapd (%rsp), %xmm0
-; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O0-NEXT: addq $72, %rsp
-; CHECK-O0-NEXT: retq
+; CHECK-AVX2-O3-LABEL: atomic_vec8_double:
+; CHECK-AVX2-O3: # %bb.0:
+; CHECK-AVX2-O3-NEXT: subq $72, %rsp
+; CHECK-AVX2-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX2-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX2-O3-NEXT: movl $64, %edi
+; CHECK-AVX2-O3-NEXT: movl $2, %ecx
+; CHECK-AVX2-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX2-O3-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX2-O3-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O3-NEXT: addq $72, %rsp
+; CHECK-AVX2-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec8_double:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: subq $72, %rsp
+; CHECK-AVX512-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX512-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX512-O3-NEXT: movl $64, %edi
+; CHECK-AVX512-O3-NEXT: movl $2, %ecx
+; CHECK-AVX512-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX512-O3-NEXT: vmovups (%rsp), %zmm0
+; CHECK-AVX512-O3-NEXT: addq $72, %rsp
+; CHECK-AVX512-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: atomic_vec8_double:
; CHECK-SSE-O0: # %bb.0:
@@ -827,24 +516,36 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
; CHECK-SSE-O0-NEXT: addq $72, %rsp
; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX2-O0-LABEL: atomic_vec8_double:
+; CHECK-AVX2-O0: # %bb.0:
+; CHECK-AVX2-O0-NEXT: subq $72, %rsp
+; CHECK-AVX2-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX2-O0-NEXT: movl $64, %edi
+; CHECK-AVX2-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX2-O0-NEXT: movl $2, %ecx
+; CHECK-AVX2-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX2-O0-NEXT: vmovupd (%rsp), %ymm0
+; CHECK-AVX2-O0-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O0-NEXT: addq $72, %rsp
+; CHECK-AVX2-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec8_double:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: subq $72, %rsp
+; CHECK-AVX512-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX512-O0-NEXT: movl $64, %edi
+; CHECK-AVX512-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX512-O0-NEXT: movl $2, %ecx
+; CHECK-AVX512-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX512-O0-NEXT: vmovupd (%rsp), %zmm0
+; CHECK-AVX512-O0-NEXT: addq $72, %rsp
+; CHECK-AVX512-O0-NEXT: retq
%ret = load atomic <8 x double>, ptr %x acquire, align 4
ret <8 x double> %ret
}
define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec16_bfloat:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: subq $40, %rsp
-; CHECK-O3-NEXT: movq %rdi, %rsi
-; CHECK-O3-NEXT: movq %rsp, %rdx
-; CHECK-O3-NEXT: movl $32, %edi
-; CHECK-O3-NEXT: movl $2, %ecx
-; CHECK-O3-NEXT: callq __atomic_load@PLT
-; CHECK-O3-NEXT: movaps (%rsp), %xmm0
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT: addq $40, %rsp
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: subq $40, %rsp
@@ -870,19 +571,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
; CHECK-AVX-O3-NEXT: addq $40, %rsp
; CHECK-AVX-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec16_bfloat:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: subq $40, %rsp
-; CHECK-O0-NEXT: movq %rdi, %rsi
-; CHECK-O0-NEXT: movl $32, %edi
-; CHECK-O0-NEXT: movq %rsp, %rdx
-; CHECK-O0-NEXT: movl $2, %ecx
-; CHECK-O0-NEXT: callq __atomic_load@PLT
-; CHECK-O0-NEXT: movaps (%rsp), %xmm0
-; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT: addq $40, %rsp
-; CHECK-O0-NEXT: retq
-;
; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
; CHECK-SSE-O0: # %bb.0:
; CHECK-SSE-O0-NEXT: subq $40, %rsp
@@ -912,21 +600,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
}
define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec32_half:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: subq $72, %rsp
-; CHECK-O3-NEXT: movq %rdi, %rsi
-; CHECK-O3-NEXT: movq %rsp, %rdx
-; CHECK-O3-NEXT: movl $64, %edi
-; CHECK-O3-NEXT: movl $2, %ecx
-; CHECK-O3-NEXT: callq __atomic_load@PLT
-; CHECK-O3-NEXT: movaps (%rsp), %xmm0
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O3-NEXT: addq $72, %rsp
-; CHECK-O3-NEXT: retq
-;
; CHECK-SSE-O3-LABEL: atomic_vec32_half:
; CHECK-SSE-O3: # %bb.0:
; CHECK-SSE-O3-NEXT: subq $72, %rsp
@@ -942,20 +615,30 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
; CHECK-SSE-O3-NEXT: addq $72, %rsp
; CHECK-SSE-O3-NEXT: retq
;
-; CHECK-O0-LABEL: atomic_vec32_half:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: subq $72, %rsp
-; CHECK-O0-NEXT: movq %rdi, %rsi
-; CHECK-O0-NEXT: movl $64, %edi
-; CHECK-O0-NEXT: movq %rsp, %rdx
-; CHECK-O0-NEXT: movl $2, %ecx
-; CHECK-O0-NEXT: callq __atomic_load@PLT
-; CHECK-O0-NEXT: movaps (%rsp), %xmm0
-; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O0-NEXT: addq $72, %rsp
-; CHECK-O0-NEXT: retq
+; CHECK-AVX2-O3-LABEL: atomic_vec32_half:
+; CHECK-AVX2-O3: # %bb.0:
+; CHECK-AVX2-O3-NEXT: subq $72, %rsp
+; CHECK-AVX2-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX2-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX2-O3-NEXT: movl $64, %edi
+; CHECK-AVX2-O3-NEXT: movl $2, %ecx
+; CHECK-AVX2-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX2-O3-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX2-O3-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O3-NEXT: addq $72, %rsp
+; CHECK-AVX2-O3-NEXT: retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec32_half:
+; CHECK-AVX512-O3: # %bb.0:
+; CHECK-AVX512-O3-NEXT: subq $72, %rsp
+; CHECK-AVX512-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX512-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX512-O3-NEXT: movl $64, %edi
+; CHECK-AVX512-O3-NEXT: movl $2, %ecx
+; CHECK-AVX512-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX512-O3-NEXT: vmovups (%rsp), %zmm0
+; CHECK-AVX512-O3-NEXT: addq $72, %rsp
+; CHECK-AVX512-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: atomic_vec32_half:
; CHECK-SSE-O0: # %bb.0:
@@ -971,6 +654,31 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
; CHECK-SSE-O0-NEXT: addq $72, %rsp
; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX2-O0-LABEL: atomic_vec32_half:
+; CHECK-AVX2-O0: # %bb.0:
+; CHECK-AVX2-O0-NEXT: subq $72, %rsp
+; CHECK-AVX2-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX2-O0-NEXT: movl $64, %edi
+; CHECK-AVX2-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX2-O0-NEXT: movl $2, %ecx
+; CHECK-AVX2-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX2-O0-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX2-O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O0-NEXT: addq $72, %rsp
+; CHECK-AVX2-O0-NEXT: retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec32_half:
+; CHECK-AVX512-O0: # %bb.0:
+; CHECK-AVX512-O0-NEXT: subq $72, %rsp
+; CHECK-AVX512-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX512-O0-NEXT: movl $64, %edi
+; CHECK-AVX512-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX512-O0-NEXT: movl $2, %ecx
+; CHECK-AVX512-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX512-O0-NEXT: vmovups (%rsp), %zmm0
+; CHECK-AVX512-O0-NEXT: addq $72, %rsp
+; CHECK-AVX512-O0-NEXT: retq
%ret = load atomic <32 x half>, ptr %x acquire, align 4
ret <32 x half> %ret
}
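
A note on the __atomic_load calls checked above: atomic vector loads that are too wide (or too weakly aligned) for a native instruction fall back to the generic libatomic routine, which is why every variant sets up %edi with the byte size, %rsi with the source pointer, %rdx with a stack temporary, and %ecx with 2 (acquire) before the call. A minimal sketch, assuming the standard generic libatomic signature (the declaration is illustrative; llc emits the call itself):

; void __atomic_load(size_t size, void *src, void *dest, int order)
declare void @__atomic_load(i64, ptr, ptr, i32)

define <8 x double> @sketch_vec8_double(ptr %x) nounwind {
  ; 64 bytes is never lock-free here, so this lowers to
  ;   __atomic_load(64, %x, <stack temp>, 2 /* acquire */)
  %ret = load atomic <8 x double>, ptr %x acquire, align 4
  ret <8 x double> %ret
}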
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
index ea4d32b..d087491 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
; SSE2-LABEL: call_ret_v3bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movl 4(%rdi), %eax
-; SSE2-NEXT: pinsrw $0, %eax, %xmm1
+; SSE2-NEXT: pinsrw $0, 4(%rdi), %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: callq returns_v3bf16@PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
; AVXNECONVERT-LABEL: call_ret_v3bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
-; AVXNECONVERT-NEXT: movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT
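
The bfloat-calling-conv.ll change above is a small peephole: pinsrw accepts an r32/m16 source, so the separate movl from memory followed by a register pinsrw folds into one pinsrw with a memory operand. A minimal sketch of the shape being matched (a generic i16 lane-0 insert; the function name is illustrative):

define <8 x i16> @sketch_pinsrw_fold(ptr %p, <8 x i16> %v) nounwind {
  %w = load i16, ptr %p, align 2                 ; the loaded word folds straight
  %r = insertelement <8 x i16> %v, i16 %w, i32 0 ; into pinsrw $0, (%rdi), %xmm0
  ret <8 x i16> %r
}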
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index 4dae143..d9c21d3 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64
; Tests showing the analysis of non-constant shift amounts to improve load address math
@@ -12,42 +12,20 @@
define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub64_16:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl 4(%eax), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: andb $16, %cl
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: testb $32, %ch
-; X86-NEXT: jne .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: .LBB0_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $48, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movzwl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub64_16:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: andb $48, %cl
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shrq %cl, %rax
-; SSE-NEXT: # kill: def $ax killed $ax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub64_16:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX-NEXT: andb $48, %sil
-; AVX-NEXT: shrxq %rsi, (%rdi), %rax
-; AVX-NEXT: # kill: def $ax killed $ax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub64_16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $48, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movzwl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 63
%idx_align = and i32 %idx_bounds, -16
%sh = zext nneg i32 %idx_align to i64
@@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_16:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andb $16, %cl
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movl (%esp,%edx), %eax
-; X86-NEXT: movl 4(%esp,%edx), %edx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $112, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movzwl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub128_16:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andb $48, %cl
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: shrq %cl, %rdi
-; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: testb $64, %sil
-; SSE-NEXT: cmovneq %rdi, %rax
-; SSE-NEXT: # kill: def $ax killed $ax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub128_16:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rdx
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andb $48, %cl
-; AVX-NEXT: shrdq %cl, %rax, %rdx
-; AVX-NEXT: shrxq %rcx, %rax, %rax
-; AVX-NEXT: testb $64, %sil
-; AVX-NEXT: cmoveq %rdx, %rax
-; AVX-NEXT: # kill: def $ax killed $ax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub128_16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $112, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movzwl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -16
%sh = zext nneg i32 %idx_align to i128
@@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andb $96, %al
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl (%esp,%eax), %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $96, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub128_32:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andb $32, %cl
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: shrq %cl, %rdi
-; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: testb $64, %sil
-; SSE-NEXT: cmovneq %rdi, %rax
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub128_32:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rdx
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andb $32, %cl
-; AVX-NEXT: shrdq %cl, %rax, %rdx
-; AVX-NEXT: shrxq %rcx, %rax, %rax
-; AVX-NEXT: testb $64, %sil
-; AVX-NEXT: cmoveq %rdx, %rax
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub128_32:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $96, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -32
%sh = zext nneg i32 %idx_align to i128
@@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andb $64, %al
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: movzbl %al, %ecx
-; X86-NEXT: movl (%esp,%ecx), %eax
-; X86-NEXT: movl 4(%esp,%ecx), %edx
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $64, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
; X64-LABEL: extractSub128_64:
; X64: # %bb.0:
-; X64-NEXT: testb $64, %sil
-; X64-NEXT: je .LBB3_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: movq 8(%rdi), %rax
-; X64-NEXT: retq
-; X64-NEXT: .LBB3_1:
-; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $64, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -64
@@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_8:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl $24, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl 48(%esp,%edx), %eax
-; X86-NEXT: movl 52(%esp,%edx), %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $63, %ecx
+; X86-NEXT: movzbl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub512_8:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $56, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rdx
-; SSE-NEXT: shrq %cl, %rdx
-; SSE-NEXT: movl -120(%rsp,%rsi), %eax
-; SSE-NEXT: addl %eax, %eax
-; SSE-NEXT: notl %ecx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orl %edx, %eax
-; SSE-NEXT: # kill: def $al killed $al killed $rax
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub512_8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $56, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT: notl %ecx
-; AVX2-NEXT: movl -120(%rsp,%rsi), %edx
-; AVX2-NEXT: addl %edx, %edx
-; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT: orl %ecx, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $rax
-; AVX2-NEXT: popq %rcx
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: extractSub512_8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vmovups (%rdi), %ymm0
-; AVX512-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $56, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX512-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX512-NEXT: notl %ecx
-; AVX512-NEXT: movl -120(%rsp,%rsi), %edx
-; AVX512-NEXT: addl %edx, %edx
-; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT: orl %ecx, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $rax
-; AVX512-NEXT: popq %rcx
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: extractSub512_8:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: andl $63, %esi
+; X64-NEXT: movzbl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 511
%idx_align = and i32 %idx_bounds, -8
%ld = load i512, ptr %word, align 8
@@ -445,152 +143,21 @@ define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $56, %ecx
-; X86-NEXT: movl 48(%esp,%ecx), %eax
-; X86-NEXT: movl 52(%esp,%ecx), %edx
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $56, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub512_64:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rax
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub512_64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX2-NEXT: popq %rcx
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: extractSub512_64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vmovups (%rdi), %ymm0
-; AVX512-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT: popq %rcx
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: extractSub512_64:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: andl $56, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 511
%idx_align = and i32 %idx_bounds, -64
%sh = zext nneg i32 %idx_align to i512
@@ -603,143 +170,35 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 16(%ebp), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %edi
-; X86-NEXT: andl $48, %edi
-; X86-NEXT: movl 48(%esp,%edi), %ecx
-; X86-NEXT: movl 52(%esp,%edi), %edx
-; X86-NEXT: movl 56(%esp,%edi), %esi
-; X86-NEXT: movl 60(%esp,%edi), %edi
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $48, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: movl 4(%ecx,%edx), %edi
+; X86-NEXT: movl 8(%ecx,%edx), %ebx
+; X86-NEXT: movl 12(%ecx,%edx), %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
-; SSE-LABEL: extractSub512_128:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $48, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rax
-; SSE-NEXT: movq -120(%rsp,%rsi), %rdx
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub512_128:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rax
-; AVX-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX-NEXT: vmovups (%rdi), %ymm0
-; AVX-NEXT: vmovups 32(%rdi), %ymm1
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: shrl $3, %esi
-; AVX-NEXT: andl $48, %esi
-; AVX-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX-NEXT: movq -120(%rsp,%rsi), %rdx
-; AVX-NEXT: popq %rcx
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; X64-LABEL: extractSub512_128:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: andl $48, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
+; X64-NEXT: movq 8(%rdi,%rsi), %rdx
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 511
%idx_align = and i32 %idx_bounds, -128
%sh = zext nneg i32 %idx_align to i512
@@ -752,916 +211,21 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub4096_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1536, %esp # imm = 0x600
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 84(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 92(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 100(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 108(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 112(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 124(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 132(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 140(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 148(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 152(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 156(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 160(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 164(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 168(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 172(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 176(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 180(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 184(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 188(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 192(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 196(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 200(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 204(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 208(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 212(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 216(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 220(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 224(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 228(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 232(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 236(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 240(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 244(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 248(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 252(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 256(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 260(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 264(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 268(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 272(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 276(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 280(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 284(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 288(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 292(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 296(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 300(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 304(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 308(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 312(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 316(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 320(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 324(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 328(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 332(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 336(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 340(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 344(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 348(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 352(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 356(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 360(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 364(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 368(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 372(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 376(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 380(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 384(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 388(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 392(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 396(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 400(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 404(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 408(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 412(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 416(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 420(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 424(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 428(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 432(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 436(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 440(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 444(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 448(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 452(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 456(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 460(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 464(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 468(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 472(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 476(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 480(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 484(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 488(%eax), %ebx
-; X86-NEXT: movl 492(%eax), %edi
-; X86-NEXT: movl 496(%eax), %esi
-; X86-NEXT: movl 500(%eax), %edx
-; X86-NEXT: movl 504(%eax), %ecx
-; X86-NEXT: movl 508(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $4032, %ecx # imm = 0xFC0
-; X86-NEXT: andl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: movl 496(%esp,%ecx), %eax
-; X86-NEXT: movl 500(%esp,%ecx), %edx
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $4032, %edx # imm = 0xFC0
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub4096_64:
-; SSE: # %bb.0:
-; SSE-NEXT: subq $1176, %rsp # imm = 0x498
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 16(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 32(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 48(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 64(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 80(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 96(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 112(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 128(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: movups 144(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 160(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 176(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 192(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 208(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 224(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 240(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 256(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movups 272(%rdi), %xmm15
-; SSE-NEXT: movups 288(%rdi), %xmm14
-; SSE-NEXT: movups 304(%rdi), %xmm13
-; SSE-NEXT: movups 320(%rdi), %xmm12
-; SSE-NEXT: movups 336(%rdi), %xmm11
-; SSE-NEXT: movups 352(%rdi), %xmm10
-; SSE-NEXT: movups 368(%rdi), %xmm9
-; SSE-NEXT: movups 384(%rdi), %xmm8
-; SSE-NEXT: movups 400(%rdi), %xmm7
-; SSE-NEXT: movups 416(%rdi), %xmm6
-; SSE-NEXT: movups 432(%rdi), %xmm5
-; SSE-NEXT: movups 448(%rdi), %xmm4
-; SSE-NEXT: movups 464(%rdi), %xmm3
-; SSE-NEXT: movups 480(%rdi), %xmm2
-; SSE-NEXT: movups 496(%rdi), %xmm1
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $4032, %esi # imm = 0xFC0
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: movq 144(%rsp,%rsi), %rax
-; SSE-NEXT: addq $1176, %rsp # imm = 0x498
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub4096_64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: subq $936, %rsp # imm = 0x3A8
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-NEXT: vmovups 96(%rdi), %ymm3
-; AVX2-NEXT: vmovups 128(%rdi), %ymm4
-; AVX2-NEXT: vmovups 160(%rdi), %ymm5
-; AVX2-NEXT: vmovups 192(%rdi), %ymm6
-; AVX2-NEXT: vmovups 224(%rdi), %ymm7
-; AVX2-NEXT: vmovups 256(%rdi), %ymm8
-; AVX2-NEXT: vmovups 288(%rdi), %ymm9
-; AVX2-NEXT: vmovups 320(%rdi), %ymm10
-; AVX2-NEXT: vmovups 352(%rdi), %ymm11
-; AVX2-NEXT: vmovups 384(%rdi), %ymm12
-; AVX2-NEXT: vmovups 416(%rdi), %ymm13
-; AVX2-NEXT: vmovups 448(%rdi), %ymm14
-; AVX2-NEXT: vmovups 480(%rdi), %ymm15
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm3, (%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: andl $4032, %esi # imm = 0xFC0
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: movq -96(%rsp,%rsi), %rax
-; AVX2-NEXT: addq $936, %rsp # imm = 0x3A8
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: extractSub4096_64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: subq $904, %rsp # imm = 0x388
-; AVX512-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT: vmovups (%rdi), %ymm0
-; AVX512-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512-NEXT: vmovups 64(%rdi), %ymm2
-; AVX512-NEXT: vmovups 96(%rdi), %ymm3
-; AVX512-NEXT: vmovups 128(%rdi), %ymm4
-; AVX512-NEXT: vmovups 160(%rdi), %ymm5
-; AVX512-NEXT: vmovups 192(%rdi), %ymm6
-; AVX512-NEXT: vmovups 224(%rdi), %ymm7
-; AVX512-NEXT: vmovups 256(%rdi), %ymm8
-; AVX512-NEXT: vmovups 288(%rdi), %ymm9
-; AVX512-NEXT: vmovups 320(%rdi), %ymm10
-; AVX512-NEXT: vmovups 352(%rdi), %ymm11
-; AVX512-NEXT: vmovups 384(%rdi), %ymm12
-; AVX512-NEXT: vmovups 416(%rdi), %ymm13
-; AVX512-NEXT: andl $4032, %esi # imm = 0xFC0
-; AVX512-NEXT: vmovups 448(%rdi), %ymm14
-; AVX512-NEXT: vmovups 480(%rdi), %ymm15
-; AVX512-NEXT: vxorps %xmm16, %xmm16, %xmm16
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm4, (%rsp)
-; AVX512-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT: addq $904, %rsp # imm = 0x388
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: extractSub4096_64:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $4032, %esi # imm = 0xFC0
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 4095
%idx_align = and i32 %idx_bounds, -64
%sh = zext nneg i32 %idx_align to i4096
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index ab1feba..9816fa7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -992,6 +992,51 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {
ret i1 %2
}

+define i1 @signtest_v4i64(<4 x i64> %a0) {
+; SSE2-LABEL: signtest_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: setns %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: signtest_v4i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: signtest_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: signtest_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: signtest_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
+ %2 = icmp sgt i64 %1, -1
+ ret i1 %2
+}
+
define i1 @trunc_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: trunc_v16i16:
; SSE2: # %bb.0:
@@ -1162,11 +1207,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB29_2
+; SSE2-NEXT: je .LBB30_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB29_2:
+; SSE2-NEXT: .LBB30_2:
; SSE2-NEXT: movl $1, %eax
; SSE2-NEXT: retq
;
@@ -1181,11 +1226,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; SSE41-NEXT: pextrd $2, %xmm1, %eax
; SSE41-NEXT: orl %ecx, %eax
; SSE41-NEXT: testb $1, %al
-; SSE41-NEXT: je .LBB29_2
+; SSE41-NEXT: je .LBB30_2
; SSE41-NEXT: # %bb.1:
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB29_2:
+; SSE41-NEXT: .LBB30_2:
; SSE41-NEXT: movl $1, %eax
; SSE41-NEXT: retq
;
@@ -1200,11 +1245,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX1OR2-NEXT: vpextrd $2, %xmm0, %eax
; AVX1OR2-NEXT: orl %ecx, %eax
; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je .LBB29_2
+; AVX1OR2-NEXT: je .LBB30_2
; AVX1OR2-NEXT: # %bb.1:
; AVX1OR2-NEXT: xorl %eax, %eax
; AVX1OR2-NEXT: retq
-; AVX1OR2-NEXT: .LBB29_2:
+; AVX1OR2-NEXT: .LBB30_2:
; AVX1OR2-NEXT: movl $1, %eax
; AVX1OR2-NEXT: retq
;
@@ -1219,12 +1264,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX512F-NEXT: korw %k0, %k1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB29_2
+; AVX512F-NEXT: je .LBB30_2
; AVX512F-NEXT: # %bb.1:
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
-; AVX512F-NEXT: .LBB29_2:
+; AVX512F-NEXT: .LBB30_2:
; AVX512F-NEXT: movl $1, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1240,12 +1285,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb $1, %al
-; AVX512BW-NEXT: je .LBB29_2
+; AVX512BW-NEXT: je .LBB30_2
; AVX512BW-NEXT: # %bb.1:
; AVX512BW-NEXT: xorl %eax, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
-; AVX512BW-NEXT: .LBB29_2:
+; AVX512BW-NEXT: .LBB30_2:
; AVX512BW-NEXT: movl $1, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1259,11 +1304,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX512BWVL-NEXT: korw %k0, %k1, %k0
; AVX512BWVL-NEXT: kmovd %k0, %eax
; AVX512BWVL-NEXT: testb $1, %al
-; AVX512BWVL-NEXT: je .LBB29_2
+; AVX512BWVL-NEXT: je .LBB30_2
; AVX512BWVL-NEXT: # %bb.1:
; AVX512BWVL-NEXT: xorl %eax, %eax
; AVX512BWVL-NEXT: retq
-; AVX512BWVL-NEXT: .LBB29_2:
+; AVX512BWVL-NEXT: .LBB30_2:
; AVX512BWVL-NEXT: movl $1, %eax
; AVX512BWVL-NEXT: retq
%1 = icmp ne <3 x i32> %a, %b
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 81c4d5d..c3054a3 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -962,39 +962,22 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
}

define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movl %ecx, %eax
-; X64-NO-BMI2-NEXT: shrb $6, %al
-; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movb %al, (%rdx)
-; X64-NO-BMI2-NEXT: retq
-;
-; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: xorps %xmm1, %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movl %esi, %eax
-; X64-BMI2-NEXT: shrb $6, %al
-; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT: movb %al, (%rdx)
-; X64-BMI2-NEXT: retq
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64: # %bb.0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: leal (,%rsi,8), %eax
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT: andl $7, %esi
+; X64-NEXT: movzbl (%rsi,%rax), %eax
+; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
@@ -3417,7 +3400,6 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X86: {{.*}}
; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef..84c2cc6 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -1220,41 +1220,23 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; no @load_16byte_chunk_of_16byte_alloca

define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movl %ecx, %eax
-; X64-NO-BMI2-NEXT: shrb $6, %al
-; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movb %al, (%rdx)
-; X64-NO-BMI2-NEXT: retq
-;
-; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: movups (%rdi), %xmm0
-; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
-; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movl %esi, %eax
-; X64-BMI2-NEXT: shrb $6, %al
-; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT: movb %al, (%rdx)
-; X64-BMI2-NEXT: retq
+; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64: # %bb.0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: movups 16(%rdi), %xmm1
+; X64-NEXT: leal (,%rsi,8), %eax
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT: andl $7, %esi
+; X64-NEXT: movzbl (%rsi,%rax), %eax
+; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD: # %bb.0:
@@ -2156,7 +2138,6 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; no @load_32byte_chunk_of_32byte_alloca
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X86: {{.*}}
; X86-NO-SHLD: {{.*}}