diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2024-03-27 17:01:41 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2024-03-27 17:01:41 +0000 |
commit | dcd0f2b6103072b74b446c2d1e9ecec60001a28c (patch) | |
tree | c2adc9a134935b6f19bdfe89b23dc694d1574d13 | |
parent | c335accb07c0cfa4bd7f47edc94c9005692edfcc (diff) | |
download | llvm-dcd0f2b6103072b74b446c2d1e9ecec60001a28c.zip llvm-dcd0f2b6103072b74b446c2d1e9ecec60001a28c.tar.gz llvm-dcd0f2b6103072b74b446c2d1e9ecec60001a28c.tar.bz2 |
[X86] combineExtractFromVectorLoad support extraction from vector of different types to the extraction type/index
combineExtractFromVectorLoad no longer uses the vector we're extracting from to determine the pointer offset, allowing us to extract from types that have been bitcast to work with specific target shuffles.
Fixes #85419
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 23 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/extractelement-load.ll | 35 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/pr45378.ll | 40 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/setcc-non-simple-type.ll | 36 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/var-permute-128.ll | 32 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vec_int_to_fp.ll | 305 |
6 files changed, 179 insertions, 292 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a229f6e..9d98d31 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43999,18 +43999,18 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // integer, that requires a potentially expensive XMM -> GPR transfer. // Additionally, if we can convert to a scalar integer load, that will likely // be folded into a subsequent integer op. +// Note: SrcVec might not have a VecVT type, but it must be the same size. // Note: Unlike the related fold for this in DAGCombiner, this is not limited // to a single-use of the loaded vector. For the reasons above, we // expect this to be profitable even if it creates an extra load. static SDValue -combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx, +combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Only EXTRACT_VECTOR_ELT supported so far"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT SrcVT = InputVector.getValueType(); EVT VT = N->getValueType(0); bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { @@ -44019,12 +44019,13 @@ combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx, Use->getOpcode() == ISD::SCALAR_TO_VECTOR; }); - auto *LoadVec = dyn_cast<LoadSDNode>(InputVector); + auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec); if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() && - SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && - !LikelyUsedAsVector && LoadVec->isSimple()) { + VecVT.getVectorElementType() == VT && + VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() && + DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) { SDValue NewPtr = TLI.getVectorElementPointer( - DAG, 
LoadVec->getBasePtr(), SrcVT, DAG.getVectorIdxConstant(Idx, dl)); + DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl)); unsigned PtrOff = VT.getSizeInBits() * Idx / 8; MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); @@ -44234,10 +44235,9 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx)) return DAG.getZExtOrTrunc(V, dl, VT); - if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT && - SrcOp.getValueType() == SrcVT) - if (SDValue V = - combineExtractFromVectorLoad(N, SrcOp, ExtractIdx, dl, DAG, DCI)) + if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT) + if (SDValue V = combineExtractFromVectorLoad( + N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI)) return V; return SDValue(); @@ -44651,7 +44651,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (CIdx) if (SDValue V = combineExtractFromVectorLoad( - N, InputVector, CIdx->getZExtValue(), dl, DAG, DCI)) + N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(), + dl, DAG, DCI)) return V; // Attempt to extract a i1 element by using MOVMSK to extract the signbits diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index ba2217f..022b25a 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -76,11 +76,9 @@ bb: define i64 @t4(ptr %a) { ; X86-SSE2-LABEL: t4: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqa (%eax), %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%ecx), %eax +; X86-SSE2-NEXT: movl 4(%ecx), %edx ; X86-SSE2-NEXT: retl ; ; X64-LABEL: t4: 
@@ -289,24 +287,15 @@ define i32 @PR85419(ptr %p0) { ; X86-SSE2-NEXT: .LBB8_2: ; X86-SSE2-NEXT: retl ; -; X64-SSSE3-LABEL: PR85419: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: xorl %ecx, %ecx -; X64-SSSE3-NEXT: cmpq $0, (%rdi) -; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3] -; X64-SSSE3-NEXT: movd %xmm0, %eax -; X64-SSSE3-NEXT: cmovel %ecx, %eax -; X64-SSSE3-NEXT: retq -; -; X64-AVX-LABEL: PR85419: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: cmpq $0, (%rdi) -; X64-AVX-NEXT: je .LBB8_2 -; X64-AVX-NEXT: # %bb.1: -; X64-AVX-NEXT: movl 8(%rdi), %eax -; X64-AVX-NEXT: .LBB8_2: -; X64-AVX-NEXT: retq +; X64-LABEL: PR85419: +; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq $0, (%rdi) +; X64-NEXT: je .LBB8_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: .LBB8_2: +; X64-NEXT: retq %load = load <2 x i64>, ptr %p0, align 16 %vecext.i = extractelement <2 x i64> %load, i64 0 %cmp = icmp eq i64 %vecext.i, 0 diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll index 426f4ee..6a5770a 100644 --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) @@ -71,28 +71,12 @@ define i1 @parseHeaders2_scalar_or(ptr %ptr) nounwind { } define i1 @parseHeaders2_scalar_and(ptr %ptr) nounwind { -; SSE2-LABEL: parseHeaders2_scalar_and: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, (%rdi) -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: parseHeaders2_scalar_and: -; SSE41: # %bb.0: -; SSE41-NEXT: movq (%rdi), %rax -; SSE41-NEXT: testq %rax, 8(%rdi) -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq -; -; AVX-LABEL: parseHeaders2_scalar_and: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: testq %rax, 8(%rdi) -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; CHECK-LABEL: parseHeaders2_scalar_and: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: testq %rax, 8(%rdi) +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %vload = load <2 x i64>, ptr %ptr, align 8 %v1 = extractelement <2 x i64> %vload, i32 0 %v2 = extractelement <2 x i64> %vload, i32 1 diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 2187c65..97c3c204 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -60,36 +60,30 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: .LBB0_2: # %vector.body ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: 
Depth=2 -; CHECK-NEXT: movdqu 1024(%rdx,%rdi), %xmm5 -; CHECK-NEXT: movdqu 1040(%rdx,%rdi), %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; CHECK-NEXT: movq %xmm5, %r8 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; CHECK-NEXT: movq %xmm5, %r9 -; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi -; CHECK-NEXT: movq %rcx, %r10 -; CHECK-NEXT: sbbq %r9, %r10 -; CHECK-NEXT: setge %r9b -; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: andl $1, %r9d -; CHECK-NEXT: negq %r9 -; CHECK-NEXT: movq %r9, %xmm5 ; CHECK-NEXT: cmpq 1024(%rdx,%rdi), %rsi -; CHECK-NEXT: movq %rcx, %r9 -; CHECK-NEXT: sbbq %r8, %r9 +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: sbbq 1032(%rdx,%rdi), %r8 +; CHECK-NEXT: setge %r8b +; CHECK-NEXT: movzbl %r8b, %r8d +; CHECK-NEXT: andl $1, %r8d +; CHECK-NEXT: negq %r8 +; CHECK-NEXT: movq %r8, %xmm5 +; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: sbbq 1048(%rdx,%rdi), %r8 ; CHECK-NEXT: setge %r8b ; CHECK-NEXT: movzbl %r8b, %r8d ; CHECK-NEXT: andl $1, %r8d ; CHECK-NEXT: negq %r8 ; CHECK-NEXT: movq %r8, %xmm6 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; CHECK-NEXT: movdqa %xmm1, %xmm5 -; CHECK-NEXT: psllq %xmm4, %xmm5 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; CHECK-NEXT: movdqa %xmm1, %xmm6 +; CHECK-NEXT: psllq %xmm4, %xmm6 ; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; CHECK-NEXT: movdqa %xmm1, %xmm8 ; CHECK-NEXT: psllq %xmm7, %xmm8 -; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; CHECK-NEXT: andpd %xmm6, %xmm8 +; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; CHECK-NEXT: andpd %xmm5, %xmm8 ; CHECK-NEXT: orpd %xmm8, %xmm3 ; CHECK-NEXT: paddq %xmm2, %xmm4 ; CHECK-NEXT: addq $32, %rdi diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 99a3821..f2240a9 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -1101,17 +1101,13 @@ define <16 x i8> 
@var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in define void @indices_convert() { ; SSE3-LABEL: indices_convert: ; SSE3: # %bb.0: # %bb -; SSE3-NEXT: movdqa (%rax), %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps (%rax), %xmm0 +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movl (%rax), %eax +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: andl $3, %eax -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE3-NEXT: movd %xmm1, %ecx -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: andl $3, %ecx ; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1120,17 +1116,13 @@ define void @indices_convert() { ; ; SSSE3-LABEL: indices_convert: ; SSSE3: # %bb.0: # %bb -; SSSE3-NEXT: movdqa (%rax), %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSSE3-NEXT: movd %xmm1, %eax -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps (%rax), %xmm0 +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movl (%rax), %eax +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: andl $3, %eax -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSSE3-NEXT: movd %xmm1, %ecx -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] diff --git 
a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 7bbcdee..e26de4b 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2911,23 +2911,12 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { ; define <2 x double> @sitofp_load_2i64_to_2f64(ptr%a) { -; SSE2-LABEL: sitofp_load_2i64_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_2i64_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1 -; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq +; SSE-LABEL: sitofp_load_2i64_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sdq 8(%rdi), %xmm1 +; SSE-NEXT: cvtsi2sdq (%rdi), %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; VEX-LABEL: sitofp_load_2i64_to_2f64: ; VEX: # %bb.0: @@ -3093,35 +3082,16 @@ define <2 x double> @sitofp_load_2i8_to_2f64(ptr%a) { } define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) { -; SSE2-LABEL: sitofp_load_4i64_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sdq 16(%rdi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2sd %rax, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: 
sitofp_load_4i64_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: cvtsi2sdq 8(%rdi), %xmm1 -; SSE41-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: cvtsi2sdq 24(%rdi), %xmm2 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2sdq 16(%rdi), %xmm1 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE41-NEXT: retq +; SSE-LABEL: sitofp_load_4i64_to_4f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sdq 8(%rdi), %xmm1 +; SSE-NEXT: cvtsi2sdq (%rdi), %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: cvtsi2sdq 24(%rdi), %xmm2 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: cvtsi2sdq 16(%rdi), %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: retq ; ; VEX-LABEL: sitofp_load_4i64_to_4f64: ; VEX: # %bb.0: @@ -3865,22 +3835,14 @@ define <4 x double> @uitofp_load_4i8_to_4f64(ptr%a) { define <4 x float> @sitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-LABEL: sitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm0 +; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm2 ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; 
SSE41-LABEL: sitofp_load_4i64_to_4f32: @@ -4015,39 +3977,24 @@ define <4 x float> @sitofp_load_4i8_to_4f32(ptr%a) { define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-LABEL: sitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm0 +; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm2 ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: xorps %xmm4, %xmm4 -; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: cvtsi2ssq 56(%rdi), %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: cvtsi2ssq 40(%rdi), %xmm3 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ssq 32(%rdi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; 
SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_8i64_to_8f32: @@ -4256,70 +4203,64 @@ define <8 x float> @sitofp_load_8i8_to_8f32(ptr%a) { define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-LABEL: uitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_1 ; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: jmp .LBB83_3 ; SSE2-NEXT: .LBB83_1: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_3: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB83_6 ; SSE2-NEXT: .LBB83_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB83_6: +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: js .LBB83_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 +; SSE2-NEXT: jmp .LBB83_9 +; SSE2-NEXT: .LBB83_7: ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: shrq %rdx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: 
orq %rdx, %rcx ; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB83_6: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB83_7 -; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB83_9 -; SSE2-NEXT: .LBB83_7: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_9: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB83_10 ; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: jmp .LBB83_12 ; SSE2-NEXT: .LBB83_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB83_12: ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -4591,8 +4532,7 @@ define <4 x float> @uitofp_load_4i8_to_4f32(ptr%a) { define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-LABEL: uitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_1 ; SSE2-NEXT: # %bb.2: @@ -4606,127 +4546,114 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-NEXT: cvtsi2ss %rax, 
%xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_3: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB87_6 ; SSE2-NEXT: .LBB87_4: -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: shrq %rdx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_6: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: js .LBB87_7 ; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB87_9 -; SSE2-NEXT: .LBB87_7: +; SSE2-NEXT: cvtsi2ss %rcx, %xmm3 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: jns .LBB87_11 +; SSE2-NEXT: .LBB87_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB87_9: -; SSE2-NEXT: movq 48(%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm3, %rcx -; SSE2-NEXT: testq %rcx, %rcx -; SSE2-NEXT: js .LBB87_10 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: cvtsi2ss %rcx, %xmm4 ; SSE2-NEXT: jmp .LBB87_12 -; SSE2-NEXT: .LBB87_10: +; SSE2-NEXT: .LBB87_7: ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: shrq %rdx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: cvtsi2ss %rcx, %xmm4 -; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ss %rcx, %xmm3 +; SSE2-NEXT: addss 
%xmm3, %xmm3 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB87_10 +; SSE2-NEXT: .LBB87_11: +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: .LBB87_12: -; SSE2-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-NEXT: movq 56(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_13 ; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm5 ; SSE2-NEXT: jmp .LBB87_15 ; SSE2-NEXT: .LBB87_13: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm5 +; SSE2-NEXT: addss %xmm5, %xmm5 ; SSE2-NEXT: .LBB87_15: -; SSE2-NEXT: movq 32(%rdi), %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm5, %rcx -; SSE2-NEXT: testq %rcx, %rcx +; SSE2-NEXT: movq 48(%rdi), %rax +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_16 ; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: jmp .LBB87_18 ; SSE2-NEXT: .LBB87_16: -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: shrq %rdx -; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 -; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 ; SSE2-NEXT: .LBB87_18: -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: movdqa 32(%rdi), %xmm4 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: movq 40(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_19 
; SSE2-NEXT: # %bb.20: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: jmp .LBB87_21 ; SSE2-NEXT: .LBB87_19: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_21: -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: movq 32(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_22 ; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB87_24 ; SSE2-NEXT: .LBB87_22: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_24: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_load_8i64_to_8f32: |