Diffstat (limited to 'llvm/test/CodeGen/X86')
 llvm/test/CodeGen/X86/and-mask-variable.ll                               |   12
 llvm/test/CodeGen/X86/atomic-load-store.ll                               |  504
 llvm/test/CodeGen/X86/bfloat-calling-conv.ll                             |    6
 llvm/test/CodeGen/X86/trunc-srl-load.ll                                  | 1652
 llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll                            |   69
 llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll |   50
 llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll                      |   53
 7 files changed, 312 insertions(+), 2034 deletions(-)
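The assertion churn below is mechanical: every hunk is either a RUN-line update or CHECK lines regenerated by utils/update_llc_test_checks.py, the script named in each test's NOTE line. As a sketch of how such checks are refreshed (the --llc-binary path is an assumption, not part of this commit):

  # Rewrites the autogenerated CHECK blocks in place from each test's RUN lines;
  # point --llc-binary at a locally built llc.
  python3 llvm/utils/update_llc_test_checks.py \
      --llc-binary build/bin/llc \
      llvm/test/CodeGen/X86/atomic-load-store.ll \
      llvm/test/CodeGen/X86/trunc-srl-load.ll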
diff --git a/llvm/test/CodeGen/X86/and-mask-variable.ll b/llvm/test/CodeGen/X86/and-mask-variable.ll
index d89f0db..3e5bd69 100644
--- a/llvm/test/CodeGen/X86/and-mask-variable.ll
+++ b/llvm/test/CodeGen/X86/and-mask-variable.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-NOBMI
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X86-BMI2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-NOBMI
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2,+fast-bextr < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X86-NOBMI
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X86-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi,-tbm,-bmi2 < %s | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,+tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi,-tbm,+bmi2 < %s | FileCheck %s --check-prefixes=X64-BMI2
 
 define i32 @mask_pair(i32 %x, i32 %y) nounwind {
 ; X86-NOBMI-LABEL: mask_pair:
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 3e7b73a..1173c45 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O3
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
-; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0
 
 define void @test1(ptr %ptr, i32 %val1) {
 ; CHECK-LABEL: test1:
@@ -50,30 +50,10 @@ define <1 x i8> @atomic_vec1_i8(ptr %x) {
 ; CHECK-O3-NEXT:    movzbl (%rdi), %eax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i8:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i8:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i8:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movb (%rdi), %al
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i8:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movb (%rdi), %al
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i8:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movb (%rdi), %al
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i8>, ptr %x acquire, align 1
   ret <1 x i8> %ret
 }
@@ -84,30 +64,10 @@ define <1 x i16> @atomic_vec1_i16(ptr %x) {
 ; CHECK-O3-NEXT:    movzwl (%rdi), %eax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i16:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i16:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i16:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movw (%rdi), %ax
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i16:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movw (%rdi), %ax
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i16:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movw (%rdi), %ax
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i16>, ptr %x acquire, align 2
   ret <1 x i16> %ret
 }
@@ -119,35 +79,11 @@ define <1 x i32> @atomic_vec1_i8_zext(ptr %x) {
 ; CHECK-O3-NEXT:    movzbl %al, %eax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i8_zext:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    movzbl %al, %eax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i8_zext:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzbl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    movzbl %al, %eax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i8_zext:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movb (%rdi), %al
 ; CHECK-O0-NEXT:    movzbl %al, %eax
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i8_zext:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movb (%rdi), %al
-; CHECK-SSE-O0-NEXT:    movzbl %al, %eax
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i8_zext:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movb (%rdi), %al
-; CHECK-AVX-O0-NEXT:    movzbl %al, %eax
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i8>, ptr %x acquire, align 1
   %zret = zext <1 x i8> %ret to <1 x i32>
   ret <1 x i32> %zret
@@ -160,35 +96,11 @@ define <1 x i64> @atomic_vec1_i16_sext(ptr %x) {
 ; CHECK-O3-NEXT:    movswq %ax, %rax
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i16_sext:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-SSE-O3-NEXT:    movswq %ax, %rax
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i16_sext:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-AVX-O3-NEXT:    movswq %ax, %rax
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i16_sext:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movw (%rdi), %ax
 ; CHECK-O0-NEXT:    movswq %ax, %rax
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i16_sext:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    movw (%rdi), %ax
-; CHECK-SSE-O0-NEXT:    movswq %ax, %rax
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i16_sext:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    movw (%rdi), %ax
-; CHECK-AVX-O0-NEXT:    movswq %ax, %rax
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i16>, ptr %x acquire, align 2
   %sret = sext <1 x i16> %ret to <1 x i64>
   ret <1 x i64> %sret
@@ -204,12 +116,6 @@ define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) {
 }
 
 define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_bfloat:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
@@ -222,15 +128,6 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
 ; CHECK-AVX-O3-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_bfloat:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movw (%rdi), %cx
-; CHECK-O0-NEXT:    # implicit-def: $eax
-; CHECK-O0-NEXT:    movw %cx, %ax
-; CHECK-O0-NEXT:    # implicit-def: $xmm0
-; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movw (%rdi), %cx
@@ -283,30 +180,6 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O3-NEXT:    popq %rcx
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    pushq %rax
-; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O3-NEXT:    movl $8, %edi
-; CHECK-SSE-O3-NEXT:    movl $2, %ecx
-; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O3-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O3-NEXT:    popq %rcx
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    pushq %rax
-; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O3-NEXT:    movl $8, %edi
-; CHECK-AVX-O3-NEXT:    movl $2, %ecx
-; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O3-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O3-NEXT:    popq %rcx
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_ptr:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    pushq %rax
@@ -318,41 +191,11 @@ define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
 ; CHECK-O0-NEXT:    movq (%rsp), %rax
 ; CHECK-O0-NEXT:    popq %rcx
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    pushq %rax
-; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O0-NEXT:    movl $8, %edi
-; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O0-NEXT:    movl $2, %ecx
-; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O0-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O0-NEXT:    popq %rcx
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    pushq %rax
-; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O0-NEXT:    movl $8, %edi
-; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O0-NEXT:    movl $2, %ecx
-; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O0-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O0-NEXT:    popq %rcx
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
   ret <1 x ptr> %ret
 }
 
 define <1 x half> @atomic_vec1_half(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_half:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movzwl (%rdi), %eax
-; CHECK-O3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_half:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movzwl (%rdi), %eax
@@ -365,15 +208,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
 ; CHECK-AVX-O3-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_half:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movw (%rdi), %cx
-; CHECK-O0-NEXT:    # implicit-def: $eax
-; CHECK-O0-NEXT:    movw %cx, %ax
-; CHECK-O0-NEXT:    # implicit-def: $xmm0
-; CHECK-O0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_half:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movw (%rdi), %cx
@@ -396,11 +230,6 @@ define <1 x half> @atomic_vec1_half(ptr %x) {
 }
 
 define <1 x float> @atomic_vec1_float(ptr %x) {
-; CHECK-O3-LABEL: atomic_vec1_float:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_float:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -411,11 +240,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) {
 ; CHECK-AVX-O3-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_float:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_float:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -430,11 +254,6 @@ define <1 x float> @atomic_vec1_float(ptr %x) {
 }
 
 define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec1_double_align:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
@@ -445,11 +264,6 @@ define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_double_align:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
@@ -476,30 +290,6 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
 ; CHECK-O3-NEXT:    popq %rcx
 ; CHECK-O3-NEXT:    retq
 ;
-; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
-; CHECK-SSE-O3:       # %bb.0:
-; CHECK-SSE-O3-NEXT:    pushq %rax
-; CHECK-SSE-O3-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O3-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O3-NEXT:    movl $8, %edi
-; CHECK-SSE-O3-NEXT:    movl $2, %ecx
-; CHECK-SSE-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O3-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O3-NEXT:    popq %rcx
-; CHECK-SSE-O3-NEXT:    retq
-;
-; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
-; CHECK-AVX-O3:       # %bb.0:
-; CHECK-AVX-O3-NEXT:    pushq %rax
-; CHECK-AVX-O3-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O3-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O3-NEXT:    movl $8, %edi
-; CHECK-AVX-O3-NEXT:    movl $2, %ecx
-; CHECK-AVX-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O3-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O3-NEXT:    popq %rcx
-; CHECK-AVX-O3-NEXT:    retq
-;
 ; CHECK-O0-LABEL: atomic_vec1_i64:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    pushq %rax
@@ -511,47 +301,11 @@ define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
 ; CHECK-O0-NEXT:    movq (%rsp), %rax
 ; CHECK-O0-NEXT:    popq %rcx
 ; CHECK-O0-NEXT:    retq
-;
-; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
-; CHECK-SSE-O0:       # %bb.0:
-; CHECK-SSE-O0-NEXT:    pushq %rax
-; CHECK-SSE-O0-NEXT:    movq %rdi, %rsi
-; CHECK-SSE-O0-NEXT:    movl $8, %edi
-; CHECK-SSE-O0-NEXT:    movq %rsp, %rdx
-; CHECK-SSE-O0-NEXT:    movl $2, %ecx
-; CHECK-SSE-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-SSE-O0-NEXT:    movq (%rsp), %rax
-; CHECK-SSE-O0-NEXT:    popq %rcx
-; CHECK-SSE-O0-NEXT:    retq
-;
-; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
-; CHECK-AVX-O0:       # %bb.0:
-; CHECK-AVX-O0-NEXT:    pushq %rax
-; CHECK-AVX-O0-NEXT:    movq %rdi, %rsi
-; CHECK-AVX-O0-NEXT:    movl $8, %edi
-; CHECK-AVX-O0-NEXT:    movq %rsp, %rdx
-; CHECK-AVX-O0-NEXT:    movl $2, %ecx
-; CHECK-AVX-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-AVX-O0-NEXT:    movq (%rsp), %rax
-; CHECK-AVX-O0-NEXT:    popq %rcx
-; CHECK-AVX-O0-NEXT:    retq
   %ret = load atomic <1 x i64>, ptr %x acquire, align 4
   ret <1 x i64> %ret
 }
 
 define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec1_double:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $8, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec1_double:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    pushq %rax
@@ -576,18 +330,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    popq %rax
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec1_double:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $8, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec1_double:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    pushq %rax
@@ -616,18 +358,6 @@ define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
 }
 
 define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec2_i32:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    pushq %rax
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $8, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O3-NEXT:    popq %rax
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    pushq %rax
@@ -652,18 +382,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    popq %rax
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec2_i32:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    pushq %rax
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $8, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-O0-NEXT:    popq %rax
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    pushq %rax
@@ -692,18 +410,6 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
 }
 
 define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec4_float:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $24, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $16, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    addq $24, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec4_float:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $24, %rsp
@@ -728,18 +434,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    addq $24, %rsp
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec4_float:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $24, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $16, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O0-NEXT:    addq $24, %rsp
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec4_float:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    subq $24, %rsp
@@ -768,21 +462,6 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
 }
 
 define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec8_double:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $72, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $64, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O3-NEXT:    addq $72, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec8_double:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $72, %rsp
@@ -798,20 +477,30 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
 ; CHECK-SSE-O3-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec8_double:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $72, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $64, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movapd (%rsp), %xmm0
-; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O0-NEXT:    addq $72, %rsp
-; CHECK-O0-NEXT:    retq
+; CHECK-AVX2-O3-LABEL: atomic_vec8_double:
+; CHECK-AVX2-O3:       # %bb.0:
+; CHECK-AVX2-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O3-NEXT:    movl $64, %edi
+; CHECK-AVX2-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O3-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX2-O3-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec8_double:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O3-NEXT:    movl $64, %edi
+; CHECK-AVX512-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O3-NEXT:    vmovups (%rsp), %zmm0
+; CHECK-AVX512-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec8_double:
 ; CHECK-SSE-O0:       # %bb.0:
@@ -827,24 +516,36 @@ define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
 ; CHECK-SSE-O0-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
 ; CHECK-SSE-O0-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX2-O0-LABEL: atomic_vec8_double:
+; CHECK-AVX2-O0:       # %bb.0:
+; CHECK-AVX2-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O0-NEXT:    movl $64, %edi
+; CHECK-AVX2-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O0-NEXT:    vmovupd (%rsp), %ymm0
+; CHECK-AVX2-O0-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec8_double:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O0-NEXT:    movl $64, %edi
+; CHECK-AVX512-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O0-NEXT:    vmovupd (%rsp), %zmm0
+; CHECK-AVX512-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    retq
   %ret = load atomic <8 x double>, ptr %x acquire, align 4
   ret <8 x double> %ret
 }
 
 define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec16_bfloat:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $40, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $32, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT:    addq $40, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $40, %rsp
@@ -870,19 +571,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
 ; CHECK-AVX-O3-NEXT:    addq $40, %rsp
 ; CHECK-AVX-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec16_bfloat:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $40, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $32, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT:    addq $40, %rsp
-; CHECK-O0-NEXT:    retq
-;
 ; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
 ; CHECK-SSE-O0:       # %bb.0:
 ; CHECK-SSE-O0-NEXT:    subq $40, %rsp
@@ -912,21 +600,6 @@ define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
 }
 
 define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
-; CHECK-O3-LABEL: atomic_vec32_half:
-; CHECK-O3:       # %bb.0:
-; CHECK-O3-NEXT:    subq $72, %rsp
-; CHECK-O3-NEXT:    movq %rdi, %rsi
-; CHECK-O3-NEXT:    movq %rsp, %rdx
-; CHECK-O3-NEXT:    movl $64, %edi
-; CHECK-O3-NEXT:    movl $2, %ecx
-; CHECK-O3-NEXT:    callq __atomic_load@PLT
-; CHECK-O3-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O3-NEXT:    addq $72, %rsp
-; CHECK-O3-NEXT:    retq
-;
 ; CHECK-SSE-O3-LABEL: atomic_vec32_half:
 ; CHECK-SSE-O3:       # %bb.0:
 ; CHECK-SSE-O3-NEXT:    subq $72, %rsp
@@ -942,20 +615,30 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
 ; CHECK-SSE-O3-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O3-NEXT:    retq
 ;
-; CHECK-O0-LABEL: atomic_vec32_half:
-; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    subq $72, %rsp
-; CHECK-O0-NEXT:    movq %rdi, %rsi
-; CHECK-O0-NEXT:    movl $64, %edi
-; CHECK-O0-NEXT:    movq %rsp, %rdx
-; CHECK-O0-NEXT:    movl $2, %ecx
-; CHECK-O0-NEXT:    callq __atomic_load@PLT
-; CHECK-O0-NEXT:    movaps (%rsp), %xmm0
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; CHECK-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; CHECK-O0-NEXT:    addq $72, %rsp
-; CHECK-O0-NEXT:    retq
+; CHECK-AVX2-O3-LABEL: atomic_vec32_half:
+; CHECK-AVX2-O3:       # %bb.0:
+; CHECK-AVX2-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O3-NEXT:    movl $64, %edi
+; CHECK-AVX2-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O3-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX2-O3-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O3-NEXT:    retq
+;
+; CHECK-AVX512-O3-LABEL: atomic_vec32_half:
+; CHECK-AVX512-O3:       # %bb.0:
+; CHECK-AVX512-O3-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O3-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O3-NEXT:    movl $64, %edi
+; CHECK-AVX512-O3-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O3-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O3-NEXT:    vmovups (%rsp), %zmm0
+; CHECK-AVX512-O3-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O3-NEXT:    retq
 ;
 ; CHECK-SSE-O0-LABEL: atomic_vec32_half:
 ; CHECK-SSE-O0:       # %bb.0:
@@ -971,6 +654,31 @@ define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
 ; CHECK-SSE-O0-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
 ; CHECK-SSE-O0-NEXT:    addq $72, %rsp
 ; CHECK-SSE-O0-NEXT:    retq
+;
+; CHECK-AVX2-O0-LABEL: atomic_vec32_half:
+; CHECK-AVX2-O0:       # %bb.0:
+; CHECK-AVX2-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX2-O0-NEXT:    movl $64, %edi
+; CHECK-AVX2-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX2-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX2-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX2-O0-NEXT:    vmovups (%rsp), %ymm0
+; CHECK-AVX2-O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-AVX2-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX2-O0-NEXT:    retq
+;
+; CHECK-AVX512-O0-LABEL: atomic_vec32_half:
+; CHECK-AVX512-O0:       # %bb.0:
+; CHECK-AVX512-O0-NEXT:    subq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    movq %rdi, %rsi
+; CHECK-AVX512-O0-NEXT:    movl $64, %edi
+; CHECK-AVX512-O0-NEXT:    movq %rsp, %rdx
+; CHECK-AVX512-O0-NEXT:    movl $2, %ecx
+; CHECK-AVX512-O0-NEXT:    callq __atomic_load@PLT
+; CHECK-AVX512-O0-NEXT:    vmovups (%rsp), %zmm0
+; CHECK-AVX512-O0-NEXT:    addq $72, %rsp
+; CHECK-AVX512-O0-NEXT:    retq
   %ret = load atomic <32 x half>, ptr %x acquire, align 4
   ret <32 x half> %ret
 }
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
index ea4d32b..d087491 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; SSE2-LABEL: call_ret_v3bf16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    movl 4(%rdi), %eax
-; SSE2-NEXT:    pinsrw $0, %eax, %xmm1
+; SSE2-NEXT:    pinsrw $0, 4(%rdi), %xmm1
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    callq returns_v3bf16@PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; AVXNECONVERT-LABEL: call_ret_v3bf16:
 ; AVXNECONVERT:       # %bb.0:
 ; AVXNECONVERT-NEXT:    pushq %rax
-; AVXNECONVERT-NEXT:    movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
 ; AVXNECONVERT-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVXNECONVERT-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVXNECONVERT-NEXT:    callq returns_v3bf16@PLT
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index 4dae143..d9c21d3 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown                   | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64
 
 ; Tests showing for the analysis of non-constant shift amounts to improve load address math
 
@@ -12,42 +12,20 @@
 define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub64_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    andb $16, %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    jne .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $48, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movzwl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub64_16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    andb $48, %cl
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shrq %cl, %rax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub64_16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX-NEXT:    andb $48, %sil
-; AVX-NEXT:    shrxq %rsi, (%rdi), %rax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub64_16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $48, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movzwl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 63
   %idx_align = and i32 %idx_bounds, -16
   %sh = zext nneg i32 %idx_align to i64
@@ -60,67 +38,20 @@
 define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $16, %cl
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl (%esp,%edx), %eax
-; X86-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $112, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movzwl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub128_16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdx
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andb $48, %cl
-; SSE-NEXT:    movq %rdx, %rdi
-; SSE-NEXT:    shrq %cl, %rdi
-; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    testb $64, %sil
-; SSE-NEXT:    cmovneq %rdi, %rax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub128_16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rdx
-; AVX-NEXT:    movq 8(%rdi), %rax
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $48, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
-; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub128_16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $112, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movzwl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -16
   %sh = zext nneg i32 %idx_align to i128
@@ -133,62 +64,20 @@
 define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andb $96, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl (%esp,%eax), %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub128_32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdx
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andb $32, %cl
-; SSE-NEXT:    movq %rdx, %rdi
-; SSE-NEXT:    shrq %cl, %rdi
-; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    testb $64, %sil
-; SSE-NEXT:    cmovneq %rdi, %rax
-; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub128_32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rdx
-; AVX-NEXT:    movq 8(%rdi), %rax
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $32, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
-; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
-; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub128_32:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $96, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -32
   %sh = zext nneg i32 %idx_align to i128
@@ -201,46 +90,20 @@
 define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andb $64, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %eax
-; X86-NEXT:    movl 4(%esp,%ecx), %edx
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $64, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %eax
+; X86-NEXT:    movl 4(%ecx,%edx), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub128_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb $64, %sil
-; X64-NEXT:    je .LBB3_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB3_1:
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $64, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
 ; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -64
@@ -254,185 +117,20 @@
 define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl $24, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    andl $60, %edx
-; X86-NEXT:    movl 48(%esp,%edx), %eax
-; X86-NEXT:    movl 52(%esp,%edx), %edx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $63, %ecx
+; X86-NEXT:    movzbl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub512_8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movups 16(%rdi), %xmm1
-; SSE-NEXT:    movups 32(%rdi), %xmm2
-; SSE-NEXT:    movups 48(%rdi), %xmm3
-; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andl $56, %ecx
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rdx
-; SSE-NEXT:    shrq %cl, %rdx
-; SSE-NEXT:    movl -120(%rsp,%rsi), %eax
-; SSE-NEXT:    addl %eax, %eax
-; SSE-NEXT:    notl %ecx
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shlq %cl, %rax
-; SSE-NEXT:    orl %edx, %eax
-; SSE-NEXT:    # kill: def $al killed $al killed $rax
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: extractSub512_8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl %esi, %ecx
-; AVX2-NEXT:    andl $56, %ecx
-; AVX2-NEXT:    shrl $3, %esi
-; AVX2-NEXT:    andl $56, %esi
-; AVX2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX2-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX2-NEXT:    notl %ecx
-; AVX2-NEXT:    movl -120(%rsp,%rsi), %edx
-; AVX2-NEXT:    addl %edx, %edx
-; AVX2-NEXT:    shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $rax
-; AVX2-NEXT:    popq %rcx
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: extractSub512_8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT:    movl %esi, %ecx
-; AVX512-NEXT:    andl $56, %ecx
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    andl $56, %esi
-; AVX512-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rax
-; AVX512-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
-; AVX512-NEXT:    notl %ecx
-; AVX512-NEXT:    movl -120(%rsp,%rsi), %edx
-; AVX512-NEXT:    addl %edx, %edx
-; AVX512-NEXT:    shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT:    orl %ecx, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $rax
-; AVX512-NEXT:    popq %rcx
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; X64-LABEL: extractSub512_8:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    andl $63, %esi
+; X64-NEXT:    movzbl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 511
   %idx_align = and i32 %idx_bounds, -8
   %ld = load i512, ptr %word, align 8
@@ -445,152 +143,21 @@
 define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    andl $56, %ecx
-; X86-NEXT:    movl 48(%esp,%ecx), %eax
-; X86-NEXT:    movl 52(%esp,%ecx), %edx
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    andl $56, %edx
+; X86-NEXT:    movl (%ecx,%edx), %eax
+; X86-NEXT:    movl 4(%ecx,%edx), %edx
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub512_64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movups 16(%rdi), %xmm1
-; SSE-NEXT:    movups 32(%rdi), %xmm2
-; SSE-NEXT:    movups 48(%rdi), %xmm3
-; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rax
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: extractSub512_64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT:    shrl $3, %esi
-; AVX2-NEXT:    andl $56, %esi
-; AVX2-NEXT:    movq -128(%rsp,%rsi), %rax
-; AVX2-NEXT:    popq %rcx
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: extractSub512_64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    andl $56, %esi
-; AVX512-NEXT:    movq -128(%rsp,%rsi), %rax
-; AVX512-NEXT:    popq %rcx
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; X64-LABEL: extractSub512_64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    andl $56, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 511
   %idx_align = and i32 %idx_bounds, -64
   %sh = zext nneg i32 %idx_align to i512
@@ -603,143 +170,35 @@
 define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %edi
-; X86-NEXT:    andl $48, %edi
-; X86-NEXT:    movl 48(%esp,%edi), %ecx
-; X86-NEXT:    movl 52(%esp,%edi), %edx
-; X86-NEXT:    movl 56(%esp,%edi), %esi
-; X86-NEXT:    movl 60(%esp,%edi),
%edi -; X86-NEXT:    movl %edi, 12(%eax) -; X86-NEXT:    movl %esi, 8(%eax) -; X86-NEXT:    movl %edx, 4(%eax) -; X86-NEXT:    movl %ecx, (%eax) -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    andl $48, %edx +; X86-NEXT:    movl (%ecx,%edx), %esi +; X86-NEXT:    movl 4(%ecx,%edx), %edi +; X86-NEXT:    movl 8(%ecx,%edx), %ebx +; X86-NEXT:    movl 12(%ecx,%edx), %ecx +; X86-NEXT:    movl %ecx, 12(%eax) +; X86-NEXT:    movl %ebx, 8(%eax) +; X86-NEXT:    movl %edi, 4(%eax) +; X86-NEXT:    movl %esi, (%eax)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi  ; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl $4  ; -; SSE-LABEL: extractSub512_128: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rax -; SSE-NEXT:    # kill: def $esi killed $esi def $rsi -; SSE-NEXT:    movups (%rdi), %xmm0 -; SSE-NEXT:    movups 16(%rdi), %xmm1 -; SSE-NEXT:    movups 32(%rdi), %xmm2 -; SSE-NEXT:    movups 48(%rdi), %xmm3 -; SSE-NEXT:    xorps %xmm4, %xmm4 -; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $48, %esi -; SSE-NEXT:    movq -128(%rsp,%rsi), %rax -; SSE-NEXT:    movq -120(%rsp,%rsi), %rdx -; SSE-NEXT:    popq %rcx -; SSE-NEXT:    retq -; -; AVX-LABEL: extractSub512_128: -; AVX:       # %bb.0: -; AVX-NEXT:    pushq %rax -; AVX-NEXT:    # kill: def $esi killed $esi def $rsi -; AVX-NEXT:    vmovups (%rdi), %ymm0 -; AVX-NEXT:    vmovups 32(%rdi), %ymm1 -; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT:    shrl $3, %esi -; AVX-NEXT:    andl $48, %esi -; AVX-NEXT:    movq -128(%rsp,%rsi), %rax -; AVX-NEXT:    movq -120(%rsp,%rsi), %rdx -; AVX-NEXT:    popq %rcx -; AVX-NEXT:    vzeroupper -; AVX-NEXT:    retq +; X64-LABEL: extractSub512_128: +; X64:       # %bb.0: +; X64-NEXT:    # kill: def $esi killed $esi def $rsi +; X64-NEXT:    shrl $3, %esi +; X64-NEXT:    andl $48, %esi +; X64-NEXT:    movq (%rdi,%rsi), %rax +; X64-NEXT:    movq 8(%rdi,%rsi), %rdx +; X64-NEXT:    retq    %idx_bounds = and i32 %idx, 511    %idx_align = and i32 %idx_bounds, -128    %sh = zext nneg i32 %idx_align to i512 @@ -752,916 +211,21 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {  define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind {  ; X86-LABEL: extractSub4096_64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx -; X86-NEXT:    pushl %edi -; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $1536, %esp # imm = 0x600 -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl 4(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT:    movl 16(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 64(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 68(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 72(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 76(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 80(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 84(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 88(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 92(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 96(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 100(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 104(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 108(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 112(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 116(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 120(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 124(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 128(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 132(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 136(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 140(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 144(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 148(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 152(%eax), %ecx -; 
X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 156(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 160(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 164(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 168(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 172(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 176(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 180(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 184(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 188(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 192(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 196(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 200(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 204(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 208(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 212(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 216(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 220(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 224(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 228(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 232(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 236(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 240(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 244(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 248(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 252(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 256(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 260(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 264(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 268(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 272(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 276(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 280(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 284(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 288(%eax), %ecx -; X86-NEXT:    movl 
%ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 292(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 296(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 300(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 304(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 308(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 312(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 316(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 320(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 324(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 328(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 332(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 336(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 340(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 344(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 348(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 352(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 356(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 360(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 364(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 368(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 372(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 376(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 380(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 384(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 388(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 392(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 396(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 400(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 404(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 408(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 412(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 416(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 420(%eax), %ecx -; X86-NEXT:    movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 424(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 428(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 432(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 436(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 440(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 444(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 448(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 452(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 456(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 460(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 464(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 468(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 472(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 476(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 480(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 484(%eax), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 488(%eax), %ebx -; X86-NEXT:    movl 492(%eax), %edi -; X86-NEXT:    movl 496(%eax), %esi -; X86-NEXT:    movl 500(%eax), %edx -; X86-NEXT:    movl 504(%eax), %ecx -; X86-NEXT:    movl 508(%eax), %eax -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 
{{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $4032, %ecx # imm = 0xFC0 -; X86-NEXT:    andl 12(%ebp), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT:    shrl $3, %ecx -; X86-NEXT:    movl 496(%esp,%ecx), %eax -; X86-NEXT:    movl 500(%esp,%ecx), %edx -; X86-NEXT:    leal -12(%ebp), %esp -; X86-NEXT:    popl %esi -; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl $4032, %edx # imm = 0xFC0 +; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%ecx,%edx), %eax +; X86-NEXT:    movl 4(%ecx,%edx), %edx  ; X86-NEXT:    retl  ; -; SSE-LABEL: extractSub4096_64: -; SSE:       # %bb.0: -; SSE-NEXT:    subq $1176, %rsp # imm = 0x498 -; SSE-NEXT:    # kill: def $esi killed $esi def $rsi -; SSE-NEXT:    movups (%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 16(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 32(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 48(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 64(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 80(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 96(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 112(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 128(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT:    movups 144(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 160(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 176(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 192(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 208(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 224(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 240(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 256(%rdi), %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT:    movups 272(%rdi), %xmm15 -; SSE-NEXT:    movups 288(%rdi), %xmm14 -; SSE-NEXT:    movups 304(%rdi), %xmm13 -; SSE-NEXT:    movups 320(%rdi), %xmm12 -; SSE-NEXT:    movups 336(%rdi), %xmm11 -; SSE-NEXT:    movups 352(%rdi), %xmm10 -; SSE-NEXT:    movups 368(%rdi), %xmm9 -; SSE-NEXT:    movups 384(%rdi), %xmm8 -; SSE-NEXT:    movups 400(%rdi), %xmm7 -; SSE-NEXT:    movups 416(%rdi), %xmm6 -; SSE-NEXT:    movups 432(%rdi), %xmm5 -; SSE-NEXT:    movups 448(%rdi), %xmm4 -; SSE-NEXT:    movups 464(%rdi), %xmm3 -; SSE-NEXT:    movups 480(%rdi), %xmm2 -; SSE-NEXT:    movups 496(%rdi), %xmm1 -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps 
%xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm8, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm9, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm10, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm11, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm12, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm13, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm14, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm15, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    andl $4032, %esi # imm = 0xFC0 -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    movq 144(%rsp,%rsi), %rax -; SSE-NEXT:    addq $1176, %rsp # imm = 0x498 -; SSE-NEXT:    retq -; -; AVX2-LABEL: extractSub4096_64: -; AVX2:       # %bb.0: -; AVX2-NEXT:    subq $936, %rsp # imm = 0x3A8 -; AVX2-NEXT:    vmovups (%rdi), %ymm0 -; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT:    vmovups 32(%rdi), %ymm1 -; AVX2-NEXT:    vmovups 64(%rdi), %ymm2 -; AVX2-NEXT:    vmovups 96(%rdi), %ymm3 -; AVX2-NEXT:    vmovups 128(%rdi), %ymm4 -; AVX2-NEXT:    vmovups 160(%rdi), %ymm5 -; AVX2-NEXT:    vmovups 192(%rdi), %ymm6 -; AVX2-NEXT:    vmovups 224(%rdi), %ymm7 -; AVX2-NEXT:    vmovups 256(%rdi), %ymm8 -; AVX2-NEXT:    vmovups 288(%rdi), %ymm9 -; AVX2-NEXT:    vmovups 320(%rdi), %ymm10 -; AVX2-NEXT:    vmovups 352(%rdi), %ymm11 -; AVX2-NEXT:    vmovups 384(%rdi), %ymm12 -; AVX2-NEXT:    vmovups 416(%rdi), %ymm13 -; AVX2-NEXT:    vmovups 448(%rdi), %ymm14 -; AVX2-NEXT:    vmovups 480(%rdi), %ymm15 -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm15, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm14, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm13, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm12, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm11, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm10, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm9, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm8, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm7, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm6, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm5, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm4, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm3, (%rsp) -; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi -; AVX2-NEXT:    andl $4032, %esi # imm = 0xFC0 -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    movq -96(%rsp,%rsi), %rax -; AVX2-NEXT:    addq $936, %rsp # imm = 0x3A8 -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: extractSub4096_64: -; AVX512:       # %bb.0: -; AVX512-NEXT:    subq $904, %rsp # imm = 0x388 -; AVX512-NEXT:    # kill: def $esi killed $esi def $rsi -; AVX512-NEXT:    vmovups (%rdi), %ymm0 -; 
AVX512-NEXT:    vmovups 32(%rdi), %ymm1 -; AVX512-NEXT:    vmovups 64(%rdi), %ymm2 -; AVX512-NEXT:    vmovups 96(%rdi), %ymm3 -; AVX512-NEXT:    vmovups 128(%rdi), %ymm4 -; AVX512-NEXT:    vmovups 160(%rdi), %ymm5 -; AVX512-NEXT:    vmovups 192(%rdi), %ymm6 -; AVX512-NEXT:    vmovups 224(%rdi), %ymm7 -; AVX512-NEXT:    vmovups 256(%rdi), %ymm8 -; AVX512-NEXT:    vmovups 288(%rdi), %ymm9 -; AVX512-NEXT:    vmovups 320(%rdi), %ymm10 -; AVX512-NEXT:    vmovups 352(%rdi), %ymm11 -; AVX512-NEXT:    vmovups 384(%rdi), %ymm12 -; AVX512-NEXT:    vmovups 416(%rdi), %ymm13 -; AVX512-NEXT:    andl $4032, %esi # imm = 0xFC0 -; AVX512-NEXT:    vmovups 448(%rdi), %ymm14 -; AVX512-NEXT:    vmovups 480(%rdi), %ymm15 -; AVX512-NEXT:    vxorps %xmm16, %xmm16, %xmm16 -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm16, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm15, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm14, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm13, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm12, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm11, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm10, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm9, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm8, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm7, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm6, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm5, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm4, (%rsp) -; AVX512-NEXT:    vmovups %ymm3, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    movq -128(%rsp,%rsi), %rax -; AVX512-NEXT:    addq $904, %rsp # imm = 0x388 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: extractSub4096_64: +; X64:       # %bb.0: +; X64-NEXT:    # kill: def $esi killed $esi def $rsi +; X64-NEXT:    andl $4032, %esi # imm = 0xFC0 +; X64-NEXT:    shrl $3, %esi +; X64-NEXT:    movq (%rdi,%rsi), %rax +; X64-NEXT:    retq    %idx_bounds = and i32 %idx, 4095    %idx_align = and i32 %idx_bounds, -64    %sh = zext nneg i32 %idx_align to i4096 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index ab1feba..9816fa7 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -992,6 +992,51 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {    ret i1 %2  } +define i1 @signtest_v4i64(<4 x i64> %a0) { +; SSE2-LABEL: signtest_v4i64: +; SSE2:       # %bb.0: +; SSE2-NEXT:    por %xmm1, %xmm0 +; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT:    por %xmm0, %xmm1 +; SSE2-NEXT:    movq %xmm1, %rax +; SSE2-NEXT:    testq 
%rax, %rax +; SSE2-NEXT:    setns %al +; SSE2-NEXT:    retq +; +; SSE41-LABEL: signtest_v4i64: +; SSE41:       # %bb.0: +; SSE41-NEXT:    por %xmm1, %xmm0 +; SSE41-NEXT:    ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT:    sete %al +; SSE41-NEXT:    retq +; +; AVX1-LABEL: signtest_v4i64: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT:    sete %al +; AVX1-NEXT:    vzeroupper +; AVX1-NEXT:    retq +; +; AVX2-LABEL: signtest_v4i64: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT:    vptest %ymm1, %ymm0 +; AVX2-NEXT:    sete %al +; AVX2-NEXT:    vzeroupper +; AVX2-NEXT:    retq +; +; AVX512-LABEL: signtest_v4i64: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512-NEXT:    vptest %ymm1, %ymm0 +; AVX512-NEXT:    sete %al +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    retq +  %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) +  %2 = icmp sgt i64 %1, -1 +  ret i1 %2 +} +  define i1 @trunc_v16i16(<16 x i16> %a0) {  ; SSE2-LABEL: trunc_v16i16:  ; SSE2:       # %bb.0: @@ -1162,11 +1207,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {  ; SSE2-NEXT:    movd %xmm0, %eax  ; SSE2-NEXT:    orl %ecx, %eax  ; SSE2-NEXT:    testb $1, %al -; SSE2-NEXT:    je .LBB29_2 +; SSE2-NEXT:    je .LBB30_2  ; SSE2-NEXT:  # %bb.1:  ; SSE2-NEXT:    xorl %eax, %eax  ; SSE2-NEXT:    retq -; SSE2-NEXT:  .LBB29_2: +; SSE2-NEXT:  .LBB30_2:  ; SSE2-NEXT:    movl $1, %eax  ; SSE2-NEXT:    retq  ; @@ -1181,11 +1226,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {  ; SSE41-NEXT:    pextrd $2, %xmm1, %eax  ; SSE41-NEXT:    orl %ecx, %eax  ; SSE41-NEXT:    testb $1, %al -; SSE41-NEXT:    je .LBB29_2 +; SSE41-NEXT:    je .LBB30_2  ; SSE41-NEXT:  # %bb.1:  ; SSE41-NEXT:    xorl %eax, %eax  ; SSE41-NEXT:    retq -; SSE41-NEXT:  .LBB29_2: +; SSE41-NEXT:  .LBB30_2:  ; SSE41-NEXT:    movl $1, %eax  ; SSE41-NEXT:    retq  ; @@ -1200,11 +1245,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {  ; AVX1OR2-NEXT:    vpextrd $2, %xmm0, %eax  ; AVX1OR2-NEXT:    orl %ecx, %eax  ; AVX1OR2-NEXT:    testb $1, %al -; AVX1OR2-NEXT:    je .LBB29_2 +; AVX1OR2-NEXT:    je .LBB30_2  ; AVX1OR2-NEXT:  # %bb.1:  ; AVX1OR2-NEXT:    xorl %eax, %eax  ; AVX1OR2-NEXT:    retq -; AVX1OR2-NEXT:  .LBB29_2: +; AVX1OR2-NEXT:  .LBB30_2:  ; AVX1OR2-NEXT:    movl $1, %eax  ; AVX1OR2-NEXT:    retq  ; @@ -1219,12 +1264,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {  ; AVX512F-NEXT:    korw %k0, %k1, %k0  ; AVX512F-NEXT:    kmovw %k0, %eax  ; AVX512F-NEXT:    testb $1, %al -; AVX512F-NEXT:    je .LBB29_2 +; AVX512F-NEXT:    je .LBB30_2  ; AVX512F-NEXT:  # %bb.1:  ; AVX512F-NEXT:    xorl %eax, %eax  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq -; AVX512F-NEXT:  .LBB29_2: +; AVX512F-NEXT:  .LBB30_2:  ; AVX512F-NEXT:    movl $1, %eax  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq @@ -1240,12 +1285,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {  ; AVX512BW-NEXT:    korw %k0, %k1, %k0  ; AVX512BW-NEXT:    kmovd %k0, %eax  ; AVX512BW-NEXT:    testb $1, %al -; AVX512BW-NEXT:    je .LBB29_2 +; AVX512BW-NEXT:    je .LBB30_2  ; AVX512BW-NEXT:  # %bb.1:  ; AVX512BW-NEXT:    xorl %eax, %eax  ; AVX512BW-NEXT:    vzeroupper  ; AVX512BW-NEXT:    retq -; AVX512BW-NEXT:  .LBB29_2: +; AVX512BW-NEXT:  .LBB30_2:  ; AVX512BW-NEXT:  
  movl $1, %eax  ; AVX512BW-NEXT:    vzeroupper  ; AVX512BW-NEXT:    retq @@ -1259,11 +1304,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {  ; AVX512BWVL-NEXT:    korw %k0, %k1, %k0  ; AVX512BWVL-NEXT:    kmovd %k0, %eax  ; AVX512BWVL-NEXT:    testb $1, %al -; AVX512BWVL-NEXT:    je .LBB29_2 +; AVX512BWVL-NEXT:    je .LBB30_2  ; AVX512BWVL-NEXT:  # %bb.1:  ; AVX512BWVL-NEXT:    xorl %eax, %eax  ; AVX512BWVL-NEXT:    retq -; AVX512BWVL-NEXT:  .LBB29_2: +; AVX512BWVL-NEXT:  .LBB30_2:  ; AVX512BWVL-NEXT:    movl $1, %eax  ; AVX512BWVL-NEXT:    retq    %1 = icmp ne <3 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d..c3054a3 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -962,39 +962,22 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6  }  define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64-NO-BMI2:       # %bb.0: -; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1 -; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movl %ecx, %eax -; X64-NO-BMI2-NEXT:    shrb $6, %al -; X64-NO-BMI2-NEXT:    movzbl %al, %eax -; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT:    shrq %cl, %rax -; X64-NO-BMI2-NEXT:    movb %al, (%rdx) -; X64-NO-BMI2-NEXT:    retq -; -; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64-BMI2:       # %bb.0: -; X64-BMI2-NEXT:    movups (%rdi), %xmm0 -; X64-BMI2-NEXT:    xorps %xmm1, %xmm1 -; X64-BMI2-NEXT:    shll $3, %esi -; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movl %esi, %eax -; X64-BMI2-NEXT:    shrb $6, %al -; X64-BMI2-NEXT:    movzbl %al, %eax -; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT:    movb %al, (%rdx) -; X64-BMI2-NEXT:    retq +; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64:       # %bb.0: +; X64-NEXT:    movups (%rdi), %xmm0 +; X64-NEXT:    xorps %xmm1, %xmm1 +; X64-NEXT:    leal (,%rsi,8), %eax +; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    shrb $6, %al +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    leaq -72(%rsp,%rax,8), %rax +; X64-NEXT:    andl $7, %esi +; X64-NEXT:    movzbl (%rsi,%rax), %eax +; X64-NEXT:    movb %al, (%rdx) +; X64-NEXT:    retq  ;  ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:  ; X86-NO-BMI2-NO-SHLD:       # %bb.0: @@ -3417,7 +3400,6 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i  }  ;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line:  ; ALL: {{.*}} -; X64: {{.*}}  ; X64-NO-SHLD: {{.*}}  ; X86: {{.*}}  ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8d36eef..84c2cc6 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1220,41 +1220,23 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)  ; no @load_16byte_chunk_of_16byte_alloca  define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64-NO-BMI2:       # %bb.0: -; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0 -; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1 -; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2 -; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT:    movl %ecx, %eax -; X64-NO-BMI2-NEXT:    shrb $6, %al -; X64-NO-BMI2-NEXT:    movzbl %al, %eax -; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT:    shrq %cl, %rax -; X64-NO-BMI2-NEXT:    movb %al, (%rdx) -; X64-NO-BMI2-NEXT:    retq -; -; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64-BMI2:       # %bb.0: -; X64-BMI2-NEXT:    movups (%rdi), %xmm0 -; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1 -; X64-BMI2-NEXT:    shll $3, %esi -; X64-BMI2-NEXT:    xorps %xmm2, %xmm2 -; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-BMI2-NEXT:    movl %esi, %eax -; X64-BMI2-NEXT:    shrb $6, %al -; X64-BMI2-NEXT:    movzbl %al, %eax -; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT:    movb %al, (%rdx) -; X64-BMI2-NEXT:    retq +; X64-LABEL: load_1byte_chunk_of_32byte_alloca: +; X64:       # %bb.0: +; X64-NEXT:    movups (%rdi), %xmm0 +; X64-NEXT:    movups 16(%rdi), %xmm1 +; X64-NEXT:    leal (,%rsi,8), %eax +; X64-NEXT:    xorps %xmm2, %xmm2 +; X64-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    shrb $6, %al +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    leaq -72(%rsp,%rax,8), %rax +; X64-NEXT:    andl $7, %esi +; X64-NEXT:    movzbl (%rsi,%rax), %eax +; X64-NEXT:    movb %al, (%rdx) +; X64-NEXT:    retq  ;  ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:  ; X86-NO-BMI2-NO-SHLD:       # %bb.0: @@ -2156,7 +2138,6 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst  ; no @load_32byte_chunk_of_32byte_alloca  ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:  ; ALL: {{.*}} -; X64: {{.*}}  ; X64-NO-SHLD: {{.*}}  ; X86: {{.*}}  ; X86-NO-SHLD: {{.*}} | 