Diffstat (limited to 'llvm/test/CodeGen/X86')
97 files changed, 10221 insertions, 4114 deletions
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-memop-scalar-32.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-memop-scalar-32.mir
index ba72c4f..bbb09c6 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-memop-scalar-32.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-memop-scalar-32.mir
@@ -10,18 +10,18 @@ body: |
   bb.0:
     ; X32-LABEL: name: test_memop_s8tos32
     ; X32: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
-    ; X32: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load (s1))
-    ; X32: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load (s8))
-    ; X32: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p0) :: (load (s16))
-    ; X32: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load (s32))
-    ; X32: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[DEF]](p0) :: (load (p0))
-    ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
-    ; X32: [[AND:%[0-9]+]]:_(s8) = G_AND [[LOAD]], [[C]]
-    ; X32: G_STORE [[AND]](s8), [[DEF]](p0) :: (store (s8))
-    ; X32: G_STORE [[LOAD1]](s8), [[DEF]](p0) :: (store (s8))
-    ; X32: G_STORE [[LOAD2]](s16), [[DEF]](p0) :: (store (s16))
-    ; X32: G_STORE [[LOAD3]](s32), [[DEF]](p0) :: (store (s32))
-    ; X32: G_STORE [[LOAD4]](p0), [[DEF]](p0) :: (store (p0))
+    ; X32-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load (s1))
+    ; X32-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load (s8))
+    ; X32-NEXT: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p0) :: (load (s16))
+    ; X32-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load (s32))
+    ; X32-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[DEF]](p0) :: (load (p0))
+    ; X32-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; X32-NEXT: [[AND:%[0-9]+]]:_(s8) = G_AND [[LOAD]], [[C]]
+    ; X32-NEXT: G_STORE [[AND]](s8), [[DEF]](p0) :: (store (s8))
+    ; X32-NEXT: G_STORE [[LOAD1]](s8), [[DEF]](p0) :: (store (s8))
+    ; X32-NEXT: G_STORE [[LOAD2]](s16), [[DEF]](p0) :: (store (s16))
+    ; X32-NEXT: G_STORE [[LOAD3]](s32), [[DEF]](p0) :: (store (s32))
+    ; X32-NEXT: G_STORE [[LOAD4]](p0), [[DEF]](p0) :: (store (p0))
     %0:_(p0) = IMPLICIT_DEF
     %9:_(s1) = G_LOAD %0 :: (load (s1))
     %1:_(s8) = G_LOAD %0 :: (load (s8))
@@ -46,13 +46,13 @@ body: |
     ; X32-LABEL: name: test_memop_s64
     ; X32: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
-    ; X32: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load (s32), align 8)
-    ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; X32: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[DEF]], [[C]](s32)
-    ; X32: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4)
-    ; X32: G_STORE [[LOAD]](s32), [[DEF]](p0) :: (store (s32), align 8)
-    ; X32: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[DEF]], [[C]](s32)
-    ; X32: G_STORE [[LOAD1]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 4)
+    ; X32-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load (s32), align 8)
+    ; X32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; X32-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[DEF]], [[C]](s32)
+    ; X32-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4)
+    ; X32-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p0) :: (store (s32), align 8)
+    ; X32-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[DEF]], [[C]](s32)
+    ; X32-NEXT: G_STORE [[LOAD1]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 4)
     %0:_(p0) = IMPLICIT_DEF
     %1:_(s64) = G_LOAD %0 :: (load (s64))
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-undef.mir
index 8711d84..b16fe3e 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/legalize-undef.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-undef.mir
@@ -21,6 +21,7 @@ body: |
     ; X64-NEXT: G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store (s32))
     ; X64-NEXT: [[DEF4:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
     ; X64-NEXT: G_STORE [[DEF4]](s64), [[DEF]](p0) :: (store (s64))
+    ;
     ; X32-LABEL: name: test_implicit_def
     ; X32: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
     ; X32-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
@@ -35,7 +36,7 @@ body: |
     ; X32-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF4]](s64)
     ; X32-NEXT: G_STORE [[UV]](s32), [[DEF]](p0) :: (store (s32), align 8)
     ; X32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; X32-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[DEF]], [[C1]](s32)
+    ; X32-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[DEF]], [[C1]](s32)
     ; X32-NEXT: G_STORE [[UV1]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
     %5:_(p0) = G_IMPLICIT_DEF
     %0:_(s1) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll
index 99d458a..83c319b 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll
@@ -164,12 +164,12 @@ define void @f5(ptr %a, ptr %b) {
   ; X86-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0)
   ; X86-NEXT: [[LOAD2:%[0-9]+]]:gpr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.a, align 8)
   ; X86-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 4
-  ; X86-NEXT: [[PTR_ADD:%[0-9]+]]:gpr(p0) = G_PTR_ADD [[LOAD]], [[C]](s32)
+  ; X86-NEXT: [[PTR_ADD:%[0-9]+]]:gpr(p0) = nuw inbounds G_PTR_ADD [[LOAD]], [[C]](s32)
   ; X86-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY [[PTR_ADD]](p0)
   ; X86-NEXT: [[LOAD3:%[0-9]+]]:gpr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.a + 4, basealign 8)
   ; X86-NEXT: [[MV:%[0-9]+]]:gpr(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32)
   ; X86-NEXT: [[LOAD4:%[0-9]+]]:gpr(s32) = G_LOAD [[LOAD1]](p0) :: (load (s32) from %ir.b, align 8)
-  ; X86-NEXT: [[PTR_ADD1:%[0-9]+]]:gpr(p0) = G_PTR_ADD [[LOAD1]], [[C]](s32)
+  ; X86-NEXT: [[PTR_ADD1:%[0-9]+]]:gpr(p0) = nuw inbounds G_PTR_ADD [[LOAD1]], [[C]](s32)
   ; X86-NEXT: [[LOAD5:%[0-9]+]]:gpr(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s32) from %ir.b + 4, basealign 8)
   ; X86-NEXT: [[MV1:%[0-9]+]]:gpr(s64) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32)
   ; X86-NEXT: [[COPY1:%[0-9]+]]:psr(s64) = COPY [[MV]](s64)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
index 171ccb2..2f1f8bc 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
@@ -77,12 +77,12 @@ define { double, double } @test_return_d2(double %d.coerce0, double %d.coerce1)
   ; ALL-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.d
   ; ALL-NEXT: G_STORE [[COPY]](s64), [[FRAME_INDEX1]](p0) :: (store (s64) into %ir.1)
   ; ALL-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-  ; ALL-NEXT: %5:_(p0) = nuw nusw G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
-  ; ALL-NEXT: G_STORE [[COPY1]](s64), %5(p0) :: (store (s64) into %ir.2)
+  ; ALL-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw nusw inbounds G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
+  ; ALL-NEXT: G_STORE [[COPY1]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.2)
   ; ALL-NEXT: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 8), (load (s8) from %ir.4, align 8)
   ; ALL-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.5)
-  ; ALL-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
-  ; ALL-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (dereferenceable load (s64) from %ir.5 + 8)
+  ; ALL-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
+  ; ALL-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s64) from %ir.5 + 8)
   ; ALL-NEXT: $xmm0 = COPY [[LOAD]](s64)
   ; ALL-NEXT: $xmm1 = COPY [[LOAD1]](s64)
   ; ALL-NEXT: RET 0, implicit $xmm0, implicit $xmm1
@@ -170,14 +170,14 @@ define { i64, i32 } @test_return_i3(i64 %i.coerce0, i32 %i.coerce1) {
   ; ALL-NEXT: [[FRAME_INDEX3:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.3.tmp
   ; ALL-NEXT: G_STORE [[COPY]](s64), [[FRAME_INDEX2]](p0) :: (store (s64) into %ir.0, align 4)
   ; ALL-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-  ; ALL-NEXT: %7:_(p0) = nuw nusw G_PTR_ADD [[FRAME_INDEX2]], [[C1]](s64)
-  ; ALL-NEXT: G_STORE [[COPY1]](s32), %7(p0) :: (store (s32) into %ir.1)
+  ; ALL-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw nusw inbounds G_PTR_ADD [[FRAME_INDEX2]], [[C1]](s64)
+  ; ALL-NEXT: G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.1)
   ; ALL-NEXT: G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 4), (load (s8) from %ir.3, align 4)
   ; ALL-NEXT: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 4), (load (s8) from %ir.5, align 4)
   ; ALL-NEXT: G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 4)
   ; ALL-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX3]](p0) :: (dereferenceable load (s64) from %ir.tmp)
-  ; ALL-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s64)
-  ; ALL-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (dereferenceable load (s32) from %ir.tmp + 8, align 8)
+  ; ALL-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s64)
+  ; ALL-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s32) from %ir.tmp + 8, align 8)
   ; ALL-NEXT: $rax = COPY [[LOAD]](s64)
   ; ALL-NEXT: $edx = COPY [[LOAD1]](s32)
   ; ALL-NEXT: RET 0, implicit $rax, implicit $edx
@@ -215,12 +215,12 @@ define { i64, i64 } @test_return_i4(i64 %i.coerce0, i64 %i.coerce1) {
   ; ALL-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i
   ; ALL-NEXT: G_STORE [[COPY]](s64), [[FRAME_INDEX1]](p0) :: (store (s64) into %ir.1, align 4)
   ; ALL-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-  ; ALL-NEXT: %5:_(p0) = nuw nusw G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
-  ; ALL-NEXT: G_STORE [[COPY1]](s64), %5(p0) :: (store (s64) into %ir.2, align 4)
+  ; ALL-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw nusw inbounds G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
+  ; ALL-NEXT: G_STORE [[COPY1]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.2, align 4)
   ; ALL-NEXT: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 4), (load (s8) from %ir.4, align 4)
   ; ALL-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.5, align 4)
-  ; ALL-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
-  ; ALL-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (dereferenceable load (s64) from %ir.5 + 8, align 4)
+  ; ALL-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
+  ; ALL-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s64) from %ir.5 + 8, align 4)
   ; ALL-NEXT: $rax = COPY [[LOAD]](s64)
   ; ALL-NEXT: $rdx = COPY [[LOAD1]](s64)
   ; ALL-NEXT: RET 0, implicit $rax, implicit $rdx
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index f6d66ab..d9064c6 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -367,44 +367,49 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %edi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll %eax, %esi
-; X86-NEXT: cmovll %ebx, %edi
-; X86-NEXT: cmovll %ebp, %edx
-; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %eax
+; X86-NEXT: sbbl 32(%ebp), %edx
+; X86-NEXT: sbbl 36(%ebp), %esi
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %ebx, %edx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: xorl %edi, %edi
 ; X86-NEXT: negl %ecx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %ebx, 4(%edx)
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl %edi, 12(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -438,44 +443,49 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %edi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll %eax, %esi
-; X86-NEXT: cmovll %ebx, %edi
-; X86-NEXT: cmovll %ebp, %edx
-; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %eax
+; X86-NEXT: sbbl 32(%ebp), %edx
+; X86-NEXT: sbbl 36(%ebp), %esi
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %ebx, %edx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: xorl %edi, %edi
 ; X86-NEXT: negl %ecx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %edx, %ebp
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %ebx, 4(%edx)
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl %edi, 12(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -639,55 +649,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl %eax, %esi
-; X86-NEXT: sbbl %ebx, %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sbbl %ebp, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sbbl %edi, %ecx
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: cmovll %edx, %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: cmovll %esi, %edx
-; X86-NEXT: cmpl %esi, %eax
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: sbbl %ebx, %ecx
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: sbbl %edi, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %ebp, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: cmpl %esi, %edi
+; X86-NEXT: sbbl 44(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: cmovll %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: cmovll 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: cmovll 28(%ebp), %eax
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: cmovll %edi, %ecx
+; X86-NEXT: cmpl %edi, %esi
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: sbbl 28(%ebp), %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl 32(%ebp), %edi
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: sbbl 36(%ebp), %edi
+; X86-NEXT: cmovll 36(%ebp), %ebx
+; X86-NEXT: cmovll 32(%ebp), %edx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: cmovll 28(%ebp), %edi
+; X86-NEXT: cmovll 24(%ebp), %esi
+; X86-NEXT: subl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl %edi, 8(%edx)
+; X86-NEXT: movl %esi, 12(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -848,37 +862,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovgel %ebx, %esi
-; X86-NEXT: cmovgel %ebp, %ecx
-; X86-NEXT: cmovgel %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 48(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: subl 40(%ebp), %ecx
+; X86-NEXT: sbbl 44(%ebp), %edx
+; X86-NEXT: sbbl 48(%ebp), %esi
+; X86-NEXT: sbbl 52(%ebp), %ebx
+; X86-NEXT: cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovgel %edi, %esi
+; X86-NEXT: cmovgel %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -1058,15 +1076,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
 ; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
 ; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
 ; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
@@ -1089,15 +1107,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
 ; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ecx
 ; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ecx
 ; X86-NEXT: movl %edx, %eax
-; X86-NEXT: subl %esi, %eax
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: sbbl %esi, %edx
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
@@ -1118,35 +1136,39 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: subl %edi, %ebp
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: movl %ebp, (%eax)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: subl 40(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %edx
+; X86-NEXT: sbbl 48(%ebp), %ecx
+; X86-NEXT: sbbl 52(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
 ; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -1175,35 +1197,39 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: subl %edi, %ebp
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: movl %ebp, (%eax)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: subl 40(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %edx
+; X86-NEXT: sbbl 48(%ebp), %ecx
+; X86-NEXT: sbbl 52(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
 ; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll
index 0356c27..a1a4ba8 100644
--- a/llvm/test/CodeGen/X86/abds.ll
+++ b/llvm/test/CodeGen/X86/abds.ll
@@ -343,37 +347,41 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovll %ebx, %esi
-; X86-NEXT: cmovll %ebp, %ecx
-; X86-NEXT: cmovll %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %edx
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: sbbl 36(%ebp), %ebx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -404,37 +408,41 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovll %ebx, %esi
-; X86-NEXT: cmovll %ebp, %ecx
-; X86-NEXT: cmovll %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %edx
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: sbbl 36(%ebp), %ebx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -585,37 +593,41 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovll %ebx, %esi
-; X86-NEXT: cmovll %ebp, %ecx
-; X86-NEXT: cmovll %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %edx
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: sbbl 36(%ebp), %ebx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -768,37 +780,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovll %ebx, %esi
-; X86-NEXT: cmovll %ebp, %ecx
-; X86-NEXT: cmovll %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %edx
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: sbbl 36(%ebp), %ebx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -1027,35 +1043,38 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: subl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl 40(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %esi
+; X86-NEXT: sbbl 48(%ebp), %edx
+; X86-NEXT: sbbl 52(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %edi, (%eax)
 ; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %edx, 8(%eax)
 ; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; X64-LABEL: abd_subnsw_i128:
@@ -1079,35 +1098,38 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128_undef:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: subl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl 40(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %esi
+; X86-NEXT: sbbl 48(%ebp), %edx
+; X86-NEXT: sbbl 52(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %edi, (%eax)
 ; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %edx, 8(%eax)
 ; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; X64-LABEL: abd_subnsw_i128_undef:
@@ -1282,37 +1304,41 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_select_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: cmovll %ebx, %esi
-; X86-NEXT: cmovll %ebp, %ecx
-; X86-NEXT: cmovll %eax, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: subl 24(%ebp), %ecx
+; X86-NEXT: sbbl 28(%ebp), %edx
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: sbbl 36(%ebp), %ebx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovll %edi, %esi
+; X86-NEXT: cmovll %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index 6bda99c..b7c3407 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -355,39 +355,43 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebp, %ebp
-; X86-NEXT: xorl %ebp, %ecx
-; X86-NEXT: xorl %ebp, %esi
-; X86-NEXT: xorl %ebp, %ebx
-; X86-NEXT: xorl %ebp, %edx
-; X86-NEXT: subl %ebp, %edx
-; X86-NEXT: sbbl %ebp, %ebx
-; X86-NEXT: sbbl %ebp, %esi
-; X86-NEXT: sbbl %ebp, %ecx
-; X86-NEXT: negl %edx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebx, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: subl 40(%ebp), %ecx
+; X86-NEXT: sbbl 44(%ebp), %edi
+; X86-NEXT: sbbl 48(%ebp), %esi
+; X86-NEXT: sbbl 52(%ebp), %eax
 ; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: xorl %ebx, %eax
+; X86-NEXT: xorl %ebx, %esi
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: xorl %ebx, %ecx
+; X86-NEXT: subl %ebx, %ecx
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -423,39 +427,43 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebp, %ebp
-; X86-NEXT: xorl %ebp, %ecx
-; X86-NEXT: xorl %ebp, %esi
-; X86-NEXT: xorl %ebp, %ebx
-; X86-NEXT: xorl %ebp, %edx
-; X86-NEXT: subl %ebp, %edx
-; X86-NEXT: sbbl %ebp, %ebx
-; X86-NEXT: sbbl %ebp, %esi
-; X86-NEXT: sbbl %ebp, %ecx
-; X86-NEXT: negl %edx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebx, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: subl 40(%ebp), %ecx
+; X86-NEXT: sbbl 44(%ebp), %edi
+; X86-NEXT: sbbl 48(%ebp), %esi
+; X86-NEXT: sbbl 52(%ebp), %eax
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: xorl %ebx, %eax
+; X86-NEXT: xorl %ebx, %esi
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: xorl %ebx, %ecx
+; X86-NEXT: subl %ebx, %ecx
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: negl %ecx
 ; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -621,55 +629,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl %eax, %esi
-; X86-NEXT: sbbl %ebx, %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sbbl %ebp, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sbbl %edi, %ecx
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: cmovbl %edx, %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: cmovbl %esi, %edx
-; X86-NEXT: cmpl %esi, %eax
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: sbbl %ebx, %ecx
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: sbbl %edi, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %ebp, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: cmpl %esi, %edi
+; X86-NEXT: sbbl 44(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: cmovbl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: cmovbl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: cmovbl 28(%ebp), %eax
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: cmovbl %edi, %ecx
+; X86-NEXT: cmpl %edi, %esi
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: sbbl 28(%ebp), %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl 32(%ebp), %edi
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: sbbl 36(%ebp), %edi
+; X86-NEXT: cmovbl 36(%ebp), %ebx
+; X86-NEXT: cmovbl 32(%ebp), %edx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: cmovbl 28(%ebp), %edi
+; X86-NEXT: cmovbl 24(%ebp), %esi
+; X86-NEXT: subl %esi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl %edi, 8(%edx)
+; X86-NEXT: movl %esi, 12(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -827,39 +839,43 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebp, %ebp
-; X86-NEXT: xorl %ebp, %ecx
-; X86-NEXT: xorl %ebp, %esi
-; X86-NEXT: xorl %ebp, %ebx
-; X86-NEXT: xorl %ebp, %edx
-; X86-NEXT: subl %ebp, %edx
-; X86-NEXT: sbbl %ebp, %ebx
-; X86-NEXT: sbbl %ebp, %esi
-; X86-NEXT: sbbl %ebp, %ecx
-; X86-NEXT: negl %edx
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %ebx, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: subl 40(%ebp), %ecx
+; X86-NEXT: sbbl 44(%ebp), %edi
+; X86-NEXT: sbbl 48(%ebp), %esi
+; X86-NEXT: sbbl 52(%ebp), %eax
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: xorl %ebx, %eax
+; X86-NEXT: xorl %ebx, %esi
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: xorl %ebx, %ecx
+; X86-NEXT: subl %ebx, %ecx
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: negl %ecx
 ; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll
index 27acec3..043c915 100644
--- a/llvm/test/CodeGen/X86/abdu.ll
+++ b/llvm/test/CodeGen/X86/abdu.ll
@@ -326,35 +326,38 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: subl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl 40(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %esi
+; X86-NEXT: sbbl 48(%ebp), %edx
+; X86-NEXT: sbbl 52(%ebp), %ecx
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %edi, (%eax)
 ; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %edx, 8(%eax)
 ; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; X64-LABEL: abd_ext_i128:
@@ -381,35 +384,38 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: subl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl 40(%ebp), %edi
+; X86-NEXT: sbbl 44(%ebp), %esi
+; X86-NEXT: sbbl 48(%ebp), %edx
+; X86-NEXT: sbbl 52(%ebp), %ecx
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %edi, (%eax)
 ; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %edx, 8(%eax)
 ; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; X64-LABEL: abd_ext_i128_undef:
@@ -548,35 +554,38 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: subl %ebx, %edi
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl
36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_minmax_i128: @@ -717,35 +726,38 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_cmp_i128: @@ -887,35 +899,38 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_select_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; 
X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_select_i128: diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index bae140a..e252d59 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -144,31 +144,34 @@ define i128 @test_i128(i128 %a) nounwind { ; ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %r = call i128 @llvm.abs.i128(i128 %a, i1 false) ret i128 %r @@ -688,13 +691,17 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind { ; ; X86-LABEL: test_sextinreg_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 24(%ebp), %esi ; X86-NEXT: xorl %edx, %esi ; X86-NEXT: subl %edx, %esi ; X86-NEXT: sbbl %edx, %ecx @@ -702,7 +709,9 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind { ; 
X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %shl = shl i128 %a, 64 %ashr = ashr exact i128 %shl, 64 diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll index c2bfcf5..1df284f 100644 --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -104,18 +104,21 @@ define i24 @test_i24_add_add_idx(i24 %x, i24 %y, i24 %z) nounwind { define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-LABEL: test_i128_add_add_idx: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl $5, {{[0-9]+}}(%esp) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: addl 24(%ebp), %esi +; X86-NEXT: adcl 28(%ebp), %edi +; X86-NEXT: adcl 32(%ebp), %ecx +; X86-NEXT: adcl 36(%ebp), %edx +; X86-NEXT: btl $5, 64(%ebp) ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ecx @@ -124,8 +127,10 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128_add_add_idx: diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll index 1e4ac3f..b111ae5 100644 --- a/llvm/test/CodeGen/X86/apx/cf.ll +++ b/llvm/test/CodeGen/X86/apx/cf.ll @@ -162,7 +162,7 @@ entry: define void @load_zext(i1 %cond, ptr %b, ptr %p) { ; CHECK-LABEL: load_zext: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andb $1, %dil +; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cfcmovnew (%rsi), %ax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: cfcmovnel %eax, (%rdx) @@ -180,7 +180,7 @@ entry: define void @load_sext(i1 %cond, ptr %b, ptr %p) { ; CHECK-LABEL: load_sext: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andb $1, %dil +; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cfcmovnel (%rsi), %eax ; CHECK-NEXT: cltq ; CHECK-NEXT: cfcmovneq %rax, (%rdx) diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll index 0eb2c63..f13627b 100644 --- a/llvm/test/CodeGen/X86/arg-copy-elide.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll @@ -188,11 +188,11 @@ define void @split_i128(ptr %sret, i128 %x) { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: subl $48, %esp -; CHECK-NEXT: movl 12(%ebp), %eax +; CHECK-NEXT: movl 24(%ebp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 16(%ebp), %ebx -; CHECK-NEXT: movl 20(%ebp), %esi -; CHECK-NEXT: movl 24(%ebp), %edi +; CHECK-NEXT: movl 28(%ebp), %ebx +; CHECK-NEXT: movl 32(%ebp), %esi +; CHECK-NEXT: movl 36(%ebp), %edi ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) 
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 217cceb..0de308a 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movaps (%rdi), %xmm1
-; SSE2-NEXT: movdqa (%rsi), %xmm2
+; SSE2-NEXT: movdqa (%rsi), %xmm0
 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd %eax, %xmm2
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
 ; SSE2-NEXT: movd %eax, %xmm1
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd %eax, %xmm3
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movd %eax, %xmm4
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
 ; SSE2-NEXT: movd %eax, %xmm5
@@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: movd %eax, %xmm8
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: decl %eax
 ; SSE2-NEXT: movd %eax, %xmm9
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
@@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: movd %eax, %xmm12
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
-; SSE2-NEXT: movd %eax, %xmm10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: decl %eax
 ; SSE2-NEXT: movd %eax, %xmm13
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
@@ -1783,43 +1783,45 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: movd %eax, %xmm15
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: decl %eax
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
 ; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSE2-NEXT: movapd %xmm4, %xmm5
 ; SSE2-NEXT: andpd %xmm1, %xmm5
 ; SSE2-NEXT: xorpd %xmm4, %xmm1
 ; SSE2-NEXT: psrlw $1, %xmm1
 ; SSE2-NEXT: paddw %xmm5, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movapd %xmm0, %xmm3
-; SSE2-NEXT: andpd %xmm2, %xmm3
-; SSE2-NEXT: xorpd %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: paddw %xmm3, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm0, %xmm3
+; SSE2-NEXT: xorpd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm0, %xmm1
 ; SSE2-NEXT: movdqu %xmm1, (%rax)
 ; SSE2-NEXT: retq
 ;
@@ -1829,74 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT: vpextrd $2, %xmm4, %eax
-; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm3, %ecx
+; AVX1-NEXT: vpextrw $5, %xmm3, %eax
 ; AVX1-NEXT: decl %edx
 ; AVX1-NEXT: vmovd %edx, %xmm4
-; AVX1-NEXT: vpextrw $2, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm5
-; AVX1-NEXT: vpextrw $1, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm6
-; AVX1-NEXT: vpextrw $0, %xmm3, %edx
+; AVX1-NEXT: vpextrw $4, %xmm3, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm5
+; AVX1-NEXT: vpextrw $1, %xmm3, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vpextrw $0, %xmm3, %eax
 ; AVX1-NEXT: decl %edx
 ; AVX1-NEXT: vmovd %edx, %xmm7
-; AVX1-NEXT: vpextrw $3, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm8
-; AVX1-NEXT: vpextrw $2, %xmm2, %edx
+; AVX1-NEXT: vpextrw $3, %xmm3, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm8
+; AVX1-NEXT: vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT: decq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpextrw $7, %xmm2, %eax
 ; AVX1-NEXT: decl %edx
 ; AVX1-NEXT: vmovd %edx, %xmm9
-; AVX1-NEXT: vpextrw $1, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm10
-; AVX1-NEXT: vpextrw $0, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm11
-; AVX1-NEXT: vpextrw $5, %xmm3, %edx
+; AVX1-NEXT: vpextrw $6, %xmm2, %edx
+; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm10
+; AVX1-NEXT: vpextrw $5, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm11
+; AVX1-NEXT: vpextrw $4, %xmm2, %eax
 ; AVX1-NEXT: decl %edx
 ; AVX1-NEXT: vmovd %edx, %xmm12
-; AVX1-NEXT: vpextrw $4, %xmm3, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm13
-; AVX1-NEXT: vpextrw $5, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm14
-; AVX1-NEXT: vpextrw $4, %xmm2, %edx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm15
-; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $1, %xmm2, %edx
 ; AVX1-NEXT: decl %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm2, %ecx
-; AVX1-NEXT: decl %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm13
+; AVX1-NEXT: vpextrw $0, %xmm2, %ecx
+; AVX1-NEXT: decl %eax
+; AVX1-NEXT: vmovd %eax, %xmm14
+; AVX1-NEXT: vpextrw $3, %xmm2, %eax
+; AVX1-NEXT: decq %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm15
+; AVX1-NEXT: vpextrw $2, %xmm2, %edx
+; AVX1-NEXT: decq %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
 ; AVX1-NEXT: decl %eax
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: decl %ecx
+; AVX1-NEXT: decl %edx
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX1-NEXT: vmovd %ecx, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT: vmovddup {{.*#+}} ymm3 = ymm6[0,0,2,2]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX1-NEXT: vmovd %edx, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1
 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
index f66f0c0..cc58bc1 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -628,13 +628,19 @@ define half @s128_to_half(i128 %x) {
 ;
 ; X86-LABEL: s128_to_half:
 ; X86: # %bb.0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: vmovups 8(%ebp), %xmm0
 ; X86-NEXT: vmovups %xmm0, (%esp)
 ; X86-NEXT: calll __floattihf
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
 ; X86-NEXT: retl
 %a = sitofp i128 %x to half
 ret half %a
@@ -713,13 +719,19 @@ define half @u128_to_half(i128 %x) {
 ;
 ; X86-LABEL: u128_to_half:
 ; X86: # %bb.0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: vmovups 8(%ebp), %xmm0
 ; X86-NEXT: vmovups %xmm0, (%esp)
 ; X86-NEXT: calll __floatuntihf
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
 ; X86-NEXT: retl
 %a = uitofp i128 %x to half
 ret half %a
@@ -1020,11 +1032,15 @@ define half @f128_to_half(fp128 %x) nounwind {
 ;
 ; X86-LABEL: f128_to_half:
 ; X86: # %bb.0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: vmovups 8(%ebp), %xmm0
 ; X86-NEXT: vmovups %xmm0, (%esp)
 ; X86-NEXT: calll __trunctfhf2
-; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl
 %a = fptrunc fp128 %x to half
 ret half %a
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 4fc0827..3338131 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -146,37 +146,40 @@ define i64 @bitselect_i64(i64 %a, i64 %b, i64 %m) nounwind {
 define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ; X86-LABEL: bitselect_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: andl 56(%ebp), %ecx
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: andl 60(%ebp), %esi
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl 48(%ebp), %edi
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: andl 64(%ebp), %edi
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: andl 68(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; X64-NOBMI-LABEL: bitselect_i128:
diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index 312f94c..143e10e 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -263,70 +263,78 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsf128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: orl %ebp, %edx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
 ; X86-NEXT: je .LBB8_1
 ; X86-NEXT: # %bb.2: # %cond.false
 ; X86-NEXT: testl %ecx, %ecx
 ; X86-NEXT: jne .LBB8_3
 ; X86-NEXT: # %bb.4: # %cond.false
-; X86-NEXT: rep bsfl %edi, %esi
-; X86-NEXT: addl $32, %esi
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB8_7
-; X86-NEXT: .LBB8_6:
-; X86-NEXT: rep bsfl %eax, %edx
-; X86-NEXT: jmp .LBB8_8
+; X86-NEXT: rep bsfl %esi, %eax
+; X86-NEXT: addl $32, %eax
+; X86-NEXT: jmp .LBB8_5
 ; X86-NEXT: .LBB8_1:
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: xorl %ebp, %ebp
-; X86-NEXT: movl $128, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: movl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT: jmp .LBB8_11
 ; X86-NEXT: .LBB8_3:
-; X86-NEXT: rep bsfl %ecx, %esi
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: rep bsfl %ecx, %eax
+; X86-NEXT: .LBB8_5: # %cond.false
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: jne .LBB8_6
-; X86-NEXT: .LBB8_7: # %cond.false
-; X86-NEXT: rep bsfl %ebp, %edx
+; X86-NEXT: # %bb.7: # %cond.false
+; X86-NEXT: rep bsfl %ebx, %edx
 ; X86-NEXT: addl $32, %edx
+; X86-NEXT: jmp .LBB8_8
+; X86-NEXT: .LBB8_6:
+; X86-NEXT: rep bsfl %edi, %edx
 ; X86-NEXT: .LBB8_8: # %cond.false
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: orl %esi, %edi
 ; X86-NEXT: jne .LBB8_10
 ; X86-NEXT: # %bb.9: # %cond.false
 ; X86-NEXT: addl $64, %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: .LBB8_10: # %cond.false
-; X86-NEXT: xorl %ebp, %ebp
+; X86-NEXT: xorl %edi, %edi
 ; X86-NEXT: .LBB8_11: # %cond.end
-; X86-NEXT: xorl %ebx, %ebx
 ; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: jne .LBB8_13
-; X86-NEXT: # %bb.12:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: .LBB8_13: # %cond.end
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: orl 32(%ebp), %ecx
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: je .LBB8_12
+; X86-NEXT: # %bb.13: # %cond.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jmp .LBB8_14
+; X86-NEXT: .LBB8_12:
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: .LBB8_14: # %cond.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -361,46 +369,49 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsf128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: orl %edi, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: je .LBB9_11
 ; X86-NEXT: # %bb.1: # %select.true.sink
 ; X86-NEXT: testl %edx, %edx
 ; X86-NEXT: jne .LBB9_2
 ; X86-NEXT: # %bb.3: # %select.true.sink
-; X86-NEXT: rep bsfl %ecx, %edi
-; X86-NEXT: addl $32, %edi
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: rep bsfl %ecx, %ebx
+; X86-NEXT: addl $32, %ebx
+; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: je .LBB9_6
 ; X86-NEXT: .LBB9_5:
-; X86-NEXT: rep bsfl %ebx, %esi
+; X86-NEXT: rep bsfl %edi, %esi
 ; X86-NEXT: orl %ecx, %edx
 ; X86-NEXT: je .LBB9_8
 ; X86-NEXT: jmp .LBB9_9
 ; X86-NEXT: .LBB9_11: # %select.end
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: movl 40(%ebp), %edi
 ; X86-NEXT: movl %edi, (%eax)
 ; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %edx, 8(%eax)
 ; X86-NEXT: movl %ecx, 12(%eax)
 ; X86-NEXT: jmp .LBB9_10
 ; X86-NEXT: .LBB9_2:
-; X86-NEXT: rep bsfl %edx, %edi
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: rep bsfl %edx, %ebx
+; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: jne .LBB9_5
 ; X86-NEXT: .LBB9_6: # %select.true.sink
 ; X86-NEXT: rep bsfl %esi, %esi
@@ -409,13 +420,14 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT: jne .LBB9_9
 ; X86-NEXT: .LBB9_8: # %select.true.sink
 ; X86-NEXT: addl $64, %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %esi, %ebx
 ; X86-NEXT: .LBB9_9: # %select.true.sink
-; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, (%eax)
 ; X86-NEXT: movl $0, 12(%eax)
 ; X86-NEXT: movl $0, 8(%eax)
 ; X86-NEXT: movl $0, 4(%eax)
 ; X86-NEXT: .LBB9_10: # %select.true.sink
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index fbca4af..ab0478a 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -291,79 +291,80 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsr128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: orl %ebp, %edx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %eax, %edx
 ; X86-NEXT: je .LBB8_1
 ; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: testl %ebp, %ebp
+; X86-NEXT: testl %esi, %esi
 ; X86-NEXT: jne .LBB8_3
 ; X86-NEXT: # %bb.4: # %cond.false
-; X86-NEXT: bsrl %ebx, %edx
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: orl $32, %edx
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
 ; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: je .LBB8_7
 ; X86-NEXT: .LBB8_6:
-; X86-NEXT: bsrl %edi, %esi
-; X86-NEXT: xorl $31, %esi
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
 ; X86-NEXT: jmp .LBB8_8
 ; X86-NEXT: .LBB8_1:
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl $128, %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl $128, %esi
 ; X86-NEXT: jmp .LBB8_11
 ; X86-NEXT: .LBB8_3:
-; X86-NEXT: bsrl %ebp, %edx
-; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
 ; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: jne .LBB8_6
 ; X86-NEXT: .LBB8_7: # %cond.false
-; X86-NEXT: bsrl %ecx, %esi
-; X86-NEXT: xorl $31, %esi
-; X86-NEXT: orl $32, %esi
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
 ; X86-NEXT: .LBB8_8: # %cond.false
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: orl %ebp, %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: orl 36(%ebp), %edx
 ; X86-NEXT: jne .LBB8_10
 ; X86-NEXT: # %bb.9: # %cond.false
-; X86-NEXT: orl $64, %esi
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: .LBB8_10: # %cond.false
-; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: .LBB8_11: # %cond.end
-; X86-NEXT: xorl %esi, %esi
 ; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: orl 32(%ebp), %ecx
+; X86-NEXT: orl 36(%ebp), %edi
 ; X86-NEXT: orl %ecx, %edi
 ; X86-NEXT: je .LBB8_12
 ; X86-NEXT: # %bb.13: # %cond.end
-; X86-NEXT: xorl $127, %edx
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: xorl $127, %esi
+; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: jmp .LBB8_14
 ; X86-NEXT: .LBB8_12:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: movl 48(%ebp), %ebx
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %esi
 ; X86-NEXT: .LBB8_14: # %cond.end
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
 ; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $4, %esp
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -398,62 +399,67 @@ define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsr128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: jne .LBB9_1
 ; X86-NEXT: # %bb.2:
-; X86-NEXT: bsrl %esi, %ecx
-; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: orl $32, %ecx
+; X86-NEXT: bsrl %edi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
 ; X86-NEXT: jmp .LBB9_3
 ; X86-NEXT: .LBB9_1:
-; X86-NEXT: bsrl %edi, %ecx
-; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: xorl $31, %esi
 ; X86-NEXT: .LBB9_3:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl 24(%ebp), %ebx
 ; X86-NEXT: testl %edx, %edx
 ; X86-NEXT: jne .LBB9_4
 ; X86-NEXT: # %bb.5:
-; X86-NEXT: bsrl %ebx, %ebp
-; X86-NEXT: xorl $31, %ebp
-; X86-NEXT: orl $32, %ebp
-; X86-NEXT: jmp .LBB9_6
+; X86-NEXT: bsrl %ebx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: je .LBB9_7
+; X86-NEXT: jmp .LBB9_8
 ; X86-NEXT: .LBB9_4:
-; X86-NEXT: bsrl %edx, %ebp
-; X86-NEXT: xorl $31, %ebp
-; X86-NEXT: .LBB9_6:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl %eax, %edi
 ; X86-NEXT: jne .LBB9_8
-; X86-NEXT: # %bb.7:
-; X86-NEXT: orl $64, %ebp
-; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: .LBB9_7:
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %esi
 ; X86-NEXT: .LBB9_8:
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl 32(%ebp), %ebx
 ; X86-NEXT: orl %edx, %ebx
 ; X86-NEXT: jne .LBB9_9
 ; X86-NEXT: # %bb.10:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl 44(%ebp), %ecx
 ; X86-NEXT: jmp .LBB9_11
 ; X86-NEXT: .LBB9_9:
-; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: xorl $127, %esi
+; X86-NEXT: xorl %ecx, %ecx
 ; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: xorl %esi, %esi
 ; X86-NEXT: xorl %edi, %edi
 ; X86-NEXT: .LBB9_11:
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll
index 6d5e995a..673b7f1 100644
--- a/llvm/test/CodeGen/X86/bswap-wide-int.ll
+++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll
@@ -41,13 +41,16 @@ define i64 @bswap_i64(i64 %a0) nounwind {
 define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-LABEL: bswap_i128:
 ; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
 ; X86-NEXT: bswapl %edi
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: bswapl %edx
@@ -56,25 +59,32 @@ define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-NEXT: movl %edx, 8(%eax)
 ; X86-NEXT: movl %esi, 4(%eax)
 ; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; X86-MOVBE-LABEL: bswap_i128:
 ; X86-MOVBE: # %bb.0:
+; X86-MOVBE-NEXT: pushl %ebp
+; X86-MOVBE-NEXT: movl %esp, %ebp
 ; X86-MOVBE-NEXT: pushl %edi
 ; X86-MOVBE-NEXT: pushl %esi
-; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-MOVBE-NEXT: andl $-16, %esp
+; X86-MOVBE-NEXT: movl 8(%ebp), %eax
+; X86-MOVBE-NEXT: movl 32(%ebp), %ecx
+; X86-MOVBE-NEXT: movl 36(%ebp), %edx
+; X86-MOVBE-NEXT: movl 24(%ebp), %esi
+; X86-MOVBE-NEXT: movl 28(%ebp), %edi
 ; X86-MOVBE-NEXT: movbel %esi, 12(%eax)
 ; X86-MOVBE-NEXT: movbel %edi, 8(%eax)
 ; X86-MOVBE-NEXT: movbel %ecx, 4(%eax)
 ; X86-MOVBE-NEXT: movbel %edx, (%eax)
+; X86-MOVBE-NEXT: leal -8(%ebp), %esp
 ; X86-MOVBE-NEXT: popl %esi
 ; X86-MOVBE-NEXT: popl %edi
+; X86-MOVBE-NEXT: popl %ebp
 ; X86-MOVBE-NEXT: retl $4
 ;
 ; X64-LABEL: bswap_i128:
diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
index ab9fa22..24d3030 100644
--- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
+++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
@@ -48,6 +48,6 @@ return: ; preds = %catch, %entry
 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16
 ; CHECK-NEXT: .Llsda_begin0:
 ; CHECK-NEXT: .long .Ltmp0@IMGREL
-; CHECK-NEXT: .long .Ltmp1@IMGREL+1
+; CHECK-NEXT: .long .Ltmp1@IMGREL
 ; CHECK-NEXT: .long 1
 ; CHECK-NEXT: .long .LBB0_[[catch]]@IMGREL
diff --git a/llvm/test/CodeGen/X86/combine-add-ssat.ll b/llvm/test/CodeGen/X86/combine-add-ssat.ll
index 3e21798..75adcdd 100644
--- a/llvm/test/CodeGen/X86/combine-add-ssat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-ssat.ll
@@ -62,12 +62,12 @@ define <8 x i16> @combine_constfold_v8i16() {
 define <8 x i16> @combine_constfold_undef_v8i16() {
 ; SSE-LABEL: combine_constfold_undef_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,u,65534,0,65280,32768,0]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_constfold_undef_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0]
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,u,65534,0,65280,32768,0]
 ; AVX-NEXT: retq
 %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -32760, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 -10, i16 65535>)
 ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/combine-add-usat.ll b/llvm/test/CodeGen/X86/combine-add-usat.ll
index 13bc3b2..5b947dd 100644
--- a/llvm/test/CodeGen/X86/combine-add-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-usat.ll
@@ -62,12 +62,13 @@ define <8 x i16> @combine_constfold_v8i16() {
 define <8 x i16> @combine_constfold_undef_v8i16() {
 ; SSE-LABEL: combine_constfold_undef_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,u,65535,65535,65535,2,65535]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_constfold_undef_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535]
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [65535,65535,2,65535,65535,65535,2,65535]
+; AVX-NEXT: # xmm0 = mem[0,0]
 ; AVX-NEXT: retq
 %res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -65535, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 1, i16 65535>)
 ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/combine-sub-ssat.ll b/llvm/test/CodeGen/X86/combine-sub-ssat.ll
index 979331f..0dab025 100644
--- a/llvm/test/CodeGen/X86/combine-sub-ssat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-ssat.ll
@@ -62,12 +62,12 @@ define <8 x i16> @combine_constfold_v8i16() {
 define <8 x i16> @combine_constfold_undef_v8i16() {
 ; SSE-LABEL: combine_constfold_undef_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,u,0,65534,65282,32786,2]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_constfold_undef_v8i16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2]
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,u,0,65534,65282,32786,2]
 ; AVX-NEXT: retq
 %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> <i16 undef, i16 1, i16 undef, i16 65535, i16 -1, i16 -255, i16 -32760, i16 1>, <8 x i16> <i16 1, i16 undef, i16 undef, i16 65535, i16 1, i16 65535, i16 -10, i16 65535>)
 ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index b70e3fc..36e374b 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -73,17 +73,17 @@ define <8 x i16> @combine_constfold_v8i16() {
 define <8 x i16> @combine_constfold_undef_v8i16() {
 ; SSE-LABEL: combine_constfold_undef_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,u,0,65534,0,0,0]
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: combine_constfold_undef_v8i16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,u,0,65534,0,0,0]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: combine_constfold_undef_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,u,0,65534,0,0,0]
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: combine_constfold_undef_v8i16:
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
index c4c194e..7855ff2 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
@@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 {
 ; WIN64-NEXT: # encoding: [0xeb,A]
 ; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1
 ; WIN64-NEXT: .LBB1_2: # %bb2
-; WIN64-NEXT: nop # encoding: [0x90]
 ; WIN64-NEXT: .seh_startepilogue
 ; WIN64-NEXT: popq %rbx # encoding: [0x5b]
 ; WIN64-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 9c1d830..2859a87 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize {
 ; WIN64-NEXT: # encoding: [0xeb,A]
 ; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1
 ; WIN64-NEXT: .LBB1_2: # %bb2
-; WIN64-NEXT: nop # encoding: [0x90]
 ; WIN64-NEXT: .seh_startepilogue
 ; WIN64-NEXT: popq %rbx # encoding: [0x5b]
 ; WIN64-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/constant-pool-partition.ll b/llvm/test/CodeGen/X86/constant-pool-partition.ll
index 515284f..e42b41b 100644
--- a/llvm/test/CodeGen/X86/constant-pool-partition.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-partition.ll
@@ -24,11 +24,11 @@ target triple = "x86_64-grtev4-linux-gnu"
 ; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always
 ;; For function @cold_func
-; CHECK: .section .rodata.cst8.hot,"aM",@progbits,8
+; CHECK: .section .rodata.cst8.hot.,"aM",@progbits,8
 ; CHECK-NEXT: .p2align
 ; CHECK-NEXT: .LCPI0_0:
 ; CHECK-NEXT: .quad 0x3fe5c28f5c28f5c3 # double 0.68000000000000005
-; CHECK-NEXT: .section .rodata.cst8.unlikely,"aM",@progbits,8
+; CHECK-NEXT: .section .rodata.cst8.unlikely.,"aM",@progbits,8
 ; CHECK-NEXT: .p2align
 ; CHECK-NEXT: .LCPI0_1:
 ; CHECK-NEXT: .quad 0x3eb0000000000000 # double 9.5367431640625E-7
@@ -50,11 +50,11 @@ target triple = "x86_64-grtev4-linux-gnu"
 ; CHECK-NEXT: .long 0x3e000000 # float 0.125
 ;; For function @hot_func
-; CHECK: .section .rodata.cst8.hot,"aM",@progbits,8
+; CHECK: .section .rodata.cst8.hot.,"aM",@progbits,8
 ; CHECK-NEXT: .p2align
 ; CHECK-NEXT: .LCPI3_0:
 ; CHECK-NEXT: .quad 0x3fe5c28f5c28f5c3 # double 0.68000000000000005
-; CHECK-NEXT: .section .rodata.cst16.hot,"aM",@progbits,16
+; CHECK-NEXT: .section .rodata.cst16.hot.,"aM",@progbits,16
 ; CHECK-NEXT: .p2align
 ; CHECK-NEXT: .LCPI3_1:
 ; CHECK-NEXT: .long 2147483648 # 0x80000000
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d869f8e..455b72d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -152,17 +152,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: andl $-16, %esp
 ; X86-NEXT: subl $176, %esp
-; X86-NEXT: movl 20(%ebp), %edx
-; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: sarl $31, %eax
 ; X86-NEXT: xorl %eax, %ecx
 ; X86-NEXT: movl %ecx, %edi
 ; X86-NEXT: xorl %eax, %edx
 ; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %edx
 ; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %ecx
 ; X86-NEXT: xorl %eax, %ecx
 ; X86-NEXT: subl %eax, %ecx
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -172,16 +172,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: sbbl %eax, %edi
 ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl %esi, %edx
 ; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %ecx, %esi
 ; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl 48(%ebp), %ecx
 ; X86-NEXT: xorl %edx, %ecx
-; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: movl 44(%ebp), %ebx
 ; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %edi
 ; X86-NEXT: xorl %edx, %edi
 ; X86-NEXT: subl %edx, %edi
 ; X86-NEXT: sbbl %edx, %ebx
@@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: sete %al
 ; X86-NEXT: orb %cl, %al
 ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl %esi, %edx
 ; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
 ; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
 ; X86-NEXT: testl %esi, %esi
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
 ; X86-NEXT: bsrl %ebx, %edx
 ; X86-NEXT: xorl $31, %edx
 ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: bsrl %edi, %edi
 ; X86-NEXT: xorl $31, %edi
-; X86-NEXT: addl $32, %edi
+; X86-NEXT: orl $32, %edi
 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: testl %ebx, %ebx
 ; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: addl $64, %edi
+; X86-NEXT: orl $64, %edi
 ; X86-NEXT: movl %eax, %edx
 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: orl %esi, %edx
 ; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
 ; X86-NEXT: testl %eax, %eax
-; X86-NEXT: cmovel %edx, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT: bsrl %ebx, %esi
 ; X86-NEXT: xorl $31, %esi
 ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
 ; X86-NEXT: testl %ebx, %ebx
 ; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
+; X86-NEXT: orl $64, %edx
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT: orl %eax, %esi
 ; X86-NEXT: cmovnel %ecx, %edx
@@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT: adcl $-1, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT: adcl $-1, %ecx
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -488,13 +487,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: sbbl %ecx, %ebx
 ; X86-NEXT: sbbl %ecx, %esi
 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl 56(%ebp), %ecx
 ; X86-NEXT: movl %edx, (%ecx)
 ; X86-NEXT: movl %eax, 4(%ecx)
 ; X86-NEXT: movl %ebx, 8(%ecx)
 ; X86-NEXT: movl %esi, 12(%ecx)
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %ecx
 ; X86-NEXT: movl %ebx, %edi
 ; X86-NEXT: movl %edx, %esi
 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -508,7 +507,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT: adcl $0, %ebx
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 44(%ebp), %esi
 ; X86-NEXT: mull %esi
 ; X86-NEXT: addl %ecx, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -523,17 +522,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: adcl %eax, %edx
 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl 40(%ebp), %eax
 ; X86-NEXT: imull %eax, %ebx
 ; X86-NEXT: mull %edi
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: imull %esi, %edi
 ; X86-NEXT: addl %edx, %edi
 ; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %eax
 ; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %ebx
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT: imull %edx, %ebx
 ; X86-NEXT: mull %edx
@@ -543,13 +542,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: adcl %edi, %ebx
 ; X86-NEXT: addl %ecx, %eax
 ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %edx
 ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %ecx
 ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %edi
 ; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %esi
 ; X86-NEXT: sbbl %ebx, %esi
 ; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %edx, (%eax)
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index db6136c..859e924 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -152,60 +152,60 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: andl $-16, %esp
 ; X86-NEXT: subl $160, %esp
-; X86-NEXT: movl 28(%ebp), %ebx
-; X86-NEXT: movl 40(%ebp), %esi
-; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl 44(%ebp), %edi
 ; X86-NEXT: movl %edi, %eax
 ; X86-NEXT: orl %esi, %eax
 ; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: orl 36(%ebp), %ecx
+; X86-NEXT: orl 48(%ebp), %ecx
 ; X86-NEXT: orl %eax, %ecx
 ; X86-NEXT: sete %cl
-; X86-NEXT: movl 16(%ebp), %eax
-; X86-NEXT: orl 24(%ebp), %eax
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: orl 20(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: orl 36(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: orl 32(%ebp), %edx
 ; X86-NEXT: orl %eax, %edx
 ; X86-NEXT: sete %al
 ; X86-NEXT: orb %cl, %al
 ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT: bsrl %esi, %edx
 ; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl 36(%ebp), %ecx
+; X86-NEXT: bsrl 48(%ebp), %ecx
 ; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
 ; X86-NEXT: testl %esi, %esi
 ; X86-NEXT: cmovnel %edx, %ecx
 ; X86-NEXT: bsrl %edi, %edx
 ; X86-NEXT: xorl $31, %edx
 ; X86-NEXT: bsrl %ebx, %eax
 ; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
+; X86-NEXT: orl $32, %eax
 ; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: cmovnel %edx, %eax
-; X86-NEXT: addl $64, %eax
-; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl 48(%ebp), %edx
 ; X86-NEXT: orl %esi, %edx
 ; X86-NEXT: cmovnel %ecx, %eax
-; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %ebx
 ; X86-NEXT: bsrl %ebx, %edx
 ; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl 32(%ebp), %ecx
 ; X86-NEXT: bsrl %ecx, %ecx
 ; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl $32, %ecx
 ; X86-NEXT: testl %ebx, %ebx
 ; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %edi
 ; X86-NEXT: bsrl %edi, %esi
 ; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl 12(%ebp), %edx
+; X86-NEXT: bsrl 24(%ebp), %edx
 ; X86-NEXT: xorl $31, %edx
-; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl $32, %edx
 ; X86-NEXT: testl %edi, %edi
 ; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: addl $64, %edx
-; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: orl $64, %edx
+; X86-NEXT: movl 32(%ebp), %esi
 ; X86-NEXT: orl %ebx, %esi
 ; X86-NEXT: cmovnel %ecx, %edx
 ; X86-NEXT: xorl %edi, %edi
@@ -230,15 +230,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: sbbl %esi, %edx
 ; X86-NEXT: setb %dl
 ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 36(%ebp), %eax
 ; X86-NEXT: cmovnel %edi, %eax
-; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %esi
 ; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %edx
 ; X86-NEXT: cmovnel %edi, %edx
-; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: movl 24(%ebp), %ebx
 ; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: movl 56(%ebp), %edi
 ; X86-NEXT: jne .LBB4_8
 ; X86-NEXT: # %bb.1: # %_udiv-special-cases
 ; X86-NEXT: movl %eax, %edi
@@ -249,18 +249,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT: orl %eax, %ecx
 ; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl 56(%ebp), %edi
+; X86-NEXT: movl 24(%ebp), %ecx
 ; X86-NEXT: je .LBB4_8
 ; X86-NEXT: # %bb.2: # %udiv-bb1
 ; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT: xorps %xmm0, %xmm0
 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %eax
 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl 32(%ebp), %eax
 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 36(%ebp), %eax
 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT: movl %eax, %ecx
@@ -293,13 +293,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: jmp .LBB4_7
 ; X86-NEXT: .LBB4_3: # %udiv-preheader
 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 12(%ebp), %edi
+; X86-NEXT: movl 24(%ebp), %edi
 ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: movl 28(%ebp), %edi
 ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %edi
 ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %edi
 ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -326,16
+326,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrdl %cl, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl 52(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -378,12 +378,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl 40(%ebp), %esi +; X86-NEXT: andl 52(%ebp), %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl 36(%ebp), %eax +; X86-NEXT: andl 48(%ebp), %eax ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl 32(%ebp), %edx -; X86-NEXT: andl 28(%ebp), %ecx +; X86-NEXT: andl 44(%ebp), %edx +; X86-NEXT: andl 40(%ebp), %ecx ; X86-NEXT: subl %ecx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ebx @@ -413,7 +413,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: movl 56(%ebp), %edi ; X86-NEXT: .LBB4_7: # %udiv-loop-exit ; X86-NEXT: shldl $1, %esi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -432,23 +432,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, 8(%edi) ; X86-NEXT: movl %eax, 12(%edi) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: movl 52(%ebp), %edi ; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull 28(%ebp), %ecx +; X86-NEXT: imull 40(%ebp), %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -457,7 +457,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -468,26 +468,26 @@ define i128 @scalar_i128(i128 
%x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull 32(%ebp) -; X86-NEXT: movl 16(%ebp), %esi +; X86-NEXT: mull 44(%ebp) +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull 32(%ebp) +; X86-NEXT: mull 44(%ebp) ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %ebx +; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ebx, (%eax) diff --git a/llvm/test/CodeGen/X86/embed-bitcode.ll b/llvm/test/CodeGen/X86/embed-bitcode.ll index 0d66ba8..d4af954 100644 --- a/llvm/test/CodeGen/X86/embed-bitcode.ll +++ b/llvm/test/CodeGen/X86/embed-bitcode.ll @@ -1,10 +1,23 @@ ; RUN: llc -filetype=obj -mtriple=x86_64 %s -o %t ; RUN: llvm-readelf -S %t | FileCheck %s +; RUN: llc -filetype=obj -mtriple=x86_64-pc-windows-msvc %s -o %t +; RUN: llvm-readobj -S %t | FileCheck %s --check-prefix=COFF ; CHECK: .text PROGBITS 0000000000000000 [[#%x,OFF:]] 000000 00 AX 0 ; CHECK-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] 000004 00 0 ; CHECK-NEXT: .llvmcmd PROGBITS 0000000000000000 [[#%x,OFF:]] 000005 00 0 +; COFF: Name: .llvmbc (2E 6C 6C 76 6D 62 63 00) +; COFF: Characteristics [ +; COFF-NEXT: IMAGE_SCN_ALIGN_1BYTES +; COFF-NEXT: IMAGE_SCN_MEM_DISCARDABLE +; COFF-NEXT: ] +; COFF: Name: .llvmcmd (2E 6C 6C 76 6D 63 6D 64) +; COFF: Characteristics [ +; COFF-NEXT: IMAGE_SCN_ALIGN_1BYTES +; COFF-NEXT: IMAGE_SCN_MEM_DISCARDABLE +; COFF-NEXT: ] + @llvm.embedded.module = private constant [4 x i8] c"BC\C0\DE", section ".llvmbc", align 1 @llvm.cmdline = private constant [5 x i8] c"-cc1\00", section ".llvmcmd", align 1 @llvm.compiler.used = appending global [2 x ptr] [ptr @llvm.embedded.module, ptr @llvm.cmdline], section "llvm.metadata" diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll index 707b05f..bb5640a 100644 --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -481,18 +481,21 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; 
X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixtfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -501,7 +504,7 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -620,18 +623,21 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunstfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -640,7 +646,7 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -818,18 +824,21 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floattitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -838,7 +847,7 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -1016,18 +1025,21 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: 
subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floatuntitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1036,7 +1048,7 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll index 1de2484..6d4ec06 100644 --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -415,16 +415,20 @@ define dso_local void @TestFPToSIF128_I128() nounwind { ; X86-LABEL: TestFPToSIF128_I128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vf128, %eax +; X86-NEXT: movl vf128+4, %ecx +; X86-NEXT: movl vf128+8, %edx +; X86-NEXT: movl vf128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixtfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -432,7 +436,7 @@ define dso_local void @TestFPToSIF128_I128() nounwind { ; X86-NEXT: movl %edx, vi128+8 ; X86-NEXT: movl %ecx, vi128+4 ; X86-NEXT: movl %eax, vi128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -466,16 +470,20 @@ define dso_local void @TestFPToUIF128_U128() nounwind { ; X86-LABEL: TestFPToUIF128_U128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vf128, %eax +; X86-NEXT: movl vf128+4, %ecx +; X86-NEXT: movl vf128+8, %edx +; X86-NEXT: movl vf128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunstfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -483,7 +491,7 @@ define dso_local void @TestFPToUIF128_U128() nounwind { ; X86-NEXT: movl %edx, vu128+8 ; X86-NEXT: movl %ecx, vu128+4 ; X86-NEXT: movl %eax, vu128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -913,16 +921,20 @@ define dso_local void @TestSIToFPI128_F128() nounwind { ; X86-LABEL: TestSIToFPI128_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vi128, %eax +; X86-NEXT: movl vi128+4, %ecx +; X86-NEXT: movl vi128+8, %edx +; X86-NEXT: movl vi128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vi128+12 -; X86-NEXT: pushl vi128+8 -; X86-NEXT: pushl vi128+4 -; X86-NEXT: pushl vi128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floattitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -930,7 +942,7 @@ define dso_local void @TestSIToFPI128_F128() nounwind { ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -964,16 +976,20 @@ define dso_local void @TestUIToFPU128_F128() #2 { ; X86-LABEL: TestUIToFPU128_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vu128, %eax +; X86-NEXT: movl vu128+4, %ecx +; X86-NEXT: movl vu128+8, %edx +; X86-NEXT: movl vu128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vu128+12 -; X86-NEXT: pushl vu128+8 -; X86-NEXT: pushl vu128+4 -; X86-NEXT: pushl vu128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floatuntitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -981,7 +997,7 @@ define dso_local void @TestUIToFPU128_F128() #2 { ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1134,33 +1150,30 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind { ; ; X86-LABEL: TestBits128: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl $12, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx 
-; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: orl (%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sete %al -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-AVX-LABEL: TestBits128: @@ -1359,12 +1372,14 @@ define i1 @PR34866(i128 %x) nounwind { ; ; X86-LABEL: PR34866: ; X86: # %bb.0: +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX-LABEL: PR34866: @@ -1394,12 +1409,14 @@ define i1 @PR34866_commute(i128 %x) nounwind { ; ; X86-LABEL: PR34866_commute: ; X86: # %bb.0: +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX-LABEL: PR34866_commute: diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index a7eea04..ad2d690 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -41,27 +41,40 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: add: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, 
%esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: add: @@ -81,24 +94,32 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -107,9 +128,10 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -141,27 +163,40 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: sub: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: sub: @@ -181,24 +216,32 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -207,9 +250,10 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -241,27 +285,40 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: mul: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: mul: @@ -281,24 +338,32 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -307,9 +372,10 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -341,27 +407,40 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: div: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: div: @@ -381,24 +460,32 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -407,9 +494,10 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -434,31 +522,48 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; ; X86-LABEL: fma: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl 
$92, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaf128 -; X86-NEXT: addl $60, %esp -; X86-NEXT: movaps (%esp), %xmm0 -; X86-NEXT: movaps %xmm0, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%ebp) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl $92, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: fma: @@ -481,28 +586,40 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $96, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 56(%ebp) -; WIN-X86-NEXT: pushl 52(%ebp) -; WIN-X86-NEXT: pushl 48(%ebp) -; WIN-X86-NEXT: pushl 44(%ebp) -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 52(%ebp), %ebx +; WIN-X86-NEXT: movl 56(%ebp), %edi +; WIN-X86-NEXT: movl 60(%ebp), %edx +; WIN-X86-NEXT: movl 64(%ebp), %ecx +; WIN-X86-NEXT: movl 68(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 48(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 44(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 40(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) 
+; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmal -; WIN-X86-NEXT: addl $52, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -511,9 +628,10 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -538,27 +656,40 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: frem: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: frem: @@ -578,24 +709,32 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; 
WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -604,9 +743,10 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -631,23 +771,28 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; ; X86-LABEL: ceil: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll ceilf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: ceil: @@ -667,17 +812,20 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _ceill -; WIN-X86-NEXT: addl $20, %esp 
-; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -713,23 +861,28 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; ; X86-LABEL: acos: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll acosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: acos: @@ -749,17 +902,20 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _acosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -795,23 +951,28 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; ; X86-LABEL: cos: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll cosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps 
{{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: cos: @@ -831,17 +992,20 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _cosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -877,23 +1041,28 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: cosh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll coshf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: cosh: @@ -913,17 +1082,20 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _coshl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl 
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -959,23 +1131,28 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll expf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: exp:
@@ -995,17 +1172,20 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _expl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1041,23 +1221,28 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp2:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll exp2f128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: exp2:
@@ -1077,17 +1262,20 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _exp2l
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1123,23 +1311,28 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: floor:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll floorf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: floor:
@@ -1159,17 +1352,20 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _floorl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1205,23 +1401,28 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll logf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: log:
@@ -1241,17 +1442,20 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _logl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1287,23 +1491,28 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log10:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll log10f128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: log10:
@@ -1323,17 +1532,20 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _log10l
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1369,23 +1581,28 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log2:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll log2f128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: log2:
@@ -1405,17 +1622,20 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _log2l
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1451,27 +1671,40 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: maxnum:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $76, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll fmaxf128
-; X86-NEXT: addl $44, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $76, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: maxnum:
@@ -1491,24 +1724,32 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86: # %bb.0: # %entry
 ; WIN-X86-NEXT: pushl %ebp
 ; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %ebx
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $80, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 40(%ebp)
-; WIN-X86-NEXT: pushl 36(%ebp)
-; WIN-X86-NEXT: pushl 32(%ebp)
-; WIN-X86-NEXT: pushl 28(%ebp)
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl 40(%ebp), %ebx
+; WIN-X86-NEXT: movl 44(%ebp), %edx
+; WIN-X86-NEXT: movl 48(%ebp), %ecx
+; WIN-X86-NEXT: movl 52(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 32(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 28(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _fmaxl
-; WIN-X86-NEXT: addl $36, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1517,9 +1758,10 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT: movl %eax, (%esi)
 ; WIN-X86-NEXT: movl %ecx, 4(%esi)
 ; WIN-X86-NEXT: movl %esi, %eax
-; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: leal -12(%ebp), %esp
 ; WIN-X86-NEXT: popl %esi
 ; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebx
 ; WIN-X86-NEXT: popl %ebp
 ; WIN-X86-NEXT: retl
 entry:
@@ -1544,27 +1786,40 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: minnum:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $76, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll fminf128
-; X86-NEXT: addl $44, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $76, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: minnum:
@@ -1584,24 +1839,32 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86: # %bb.0: # %entry
 ; WIN-X86-NEXT: pushl %ebp
 ; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %ebx
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $80, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 40(%ebp)
-; WIN-X86-NEXT: pushl 36(%ebp)
-; WIN-X86-NEXT: pushl 32(%ebp)
-; WIN-X86-NEXT: pushl 28(%ebp)
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl 40(%ebp), %ebx
+; WIN-X86-NEXT: movl 44(%ebp), %edx
+; WIN-X86-NEXT: movl 48(%ebp), %ecx
+; WIN-X86-NEXT: movl 52(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 32(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 28(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _fminl
-; WIN-X86-NEXT: addl $36, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1610,9 +1873,10 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT: movl %eax, (%esi)
 ; WIN-X86-NEXT: movl %ecx, 4(%esi)
 ; WIN-X86-NEXT: movl %esi, %eax
-; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: leal -12(%ebp), %esp
 ; WIN-X86-NEXT: popl %esi
 ; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebx
 ; WIN-X86-NEXT: popl %ebp
 ; WIN-X86-NEXT: retl
 entry:
@@ -1637,23 +1901,28 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: nearbyint:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll nearbyintf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: nearbyint:
@@ -1673,17 +1942,20 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _nearbyintl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1719,27 +1991,40 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: pow:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $76, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll powf128
-; X86-NEXT: addl $44, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $76, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: pow:
@@ -1759,24 +2044,32 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86: # %bb.0: # %entry
 ; WIN-X86-NEXT: pushl %ebp
 ; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %ebx
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $80, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 40(%ebp)
-; WIN-X86-NEXT: pushl 36(%ebp)
-; WIN-X86-NEXT: pushl 32(%ebp)
-; WIN-X86-NEXT: pushl 28(%ebp)
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl 40(%ebp), %ebx
+; WIN-X86-NEXT: movl 44(%ebp), %edx
+; WIN-X86-NEXT: movl 48(%ebp), %ecx
+; WIN-X86-NEXT: movl 52(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 32(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 28(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _powl
-; WIN-X86-NEXT: addl $36, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1785,9 +2078,10 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT: movl %eax, (%esi)
 ; WIN-X86-NEXT: movl %ecx, 4(%esi)
 ; WIN-X86-NEXT: movl %esi, %eax
-; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: leal -12(%ebp), %esp
 ; WIN-X86-NEXT: popl %esi
 ; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebx
 ; WIN-X86-NEXT: popl %ebp
 ; WIN-X86-NEXT: retl
 entry:
@@ -1819,24 +2113,32 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ;
 ; X86-LABEL: powi:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $64, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll __powitf2
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $64, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: powi:
@@ -1853,21 +2155,26 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86: # %bb.0: # %entry
 ; WIN-X86-NEXT: pushl %ebp
 ; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %ebx
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $80, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 28(%ebp)
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl 40(%ebp), %ebx
+; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll ___powitf2
-; WIN-X86-NEXT: addl $24, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1876,9 +2183,10 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86-NEXT: movl %eax, (%esi)
 ; WIN-X86-NEXT: movl %ecx, 4(%esi)
 ; WIN-X86-NEXT: movl %esi, %eax
-; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: leal -12(%ebp), %esp
 ; WIN-X86-NEXT: popl %esi
 ; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebx
 ; WIN-X86-NEXT: popl %ebp
 ; WIN-X86-NEXT: retl
 entry:
@@ -1903,23 +2211,28 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: rint:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll rintf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: rint:
@@ -1939,17 +2252,20 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _rintl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -1985,23 +2301,28 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: round:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll roundf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: round:
@@ -2021,17 +2342,20 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _roundl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2067,23 +2391,28 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: roundeven:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll roundevenf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: roundeven:
@@ -2103,17 +2432,20 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _roundevenl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2149,23 +2481,28 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: asin:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll asinf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: asin:
@@ -2185,17 +2522,20 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _asinl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2231,23 +2571,28 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sin:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll sinf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: sin:
@@ -2267,17 +2612,20 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _sinl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2313,23 +2661,28 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sinh:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll sinhf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: sinh:
@@ -2349,17 +2702,20 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _sinhl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2395,23 +2751,28 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sqrt:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll sqrtf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: sqrt:
@@ -2431,17 +2792,20 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _sqrtl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2477,23 +2841,28 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: atan:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll atanf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: atan:
@@ -2513,17 +2882,20 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _atanl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2559,27 +2931,40 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: atan2:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $76, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll atan2f128
-; X86-NEXT: addl $44, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $76, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: atan2:
@@ -2599,24 +2984,32 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86: # %bb.0: # %entry
 ; WIN-X86-NEXT: pushl %ebp
 ; WIN-X86-NEXT: movl %esp, %ebp
+; WIN-X86-NEXT: pushl %ebx
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $80, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 40(%ebp)
-; WIN-X86-NEXT: pushl 36(%ebp)
-; WIN-X86-NEXT: pushl 32(%ebp)
-; WIN-X86-NEXT: pushl 28(%ebp)
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl 40(%ebp), %ebx
+; WIN-X86-NEXT: movl 44(%ebp), %edx
+; WIN-X86-NEXT: movl 48(%ebp), %ecx
+; WIN-X86-NEXT: movl 52(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 32(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 28(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _atan2l
-; WIN-X86-NEXT: addl $36, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2625,9 +3018,10 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT: movl %eax, (%esi)
 ; WIN-X86-NEXT: movl %ecx, 4(%esi)
 ; WIN-X86-NEXT: movl %esi, %eax
-; WIN-X86-NEXT: leal -8(%ebp), %esp
+; WIN-X86-NEXT: leal -12(%ebp), %esp
 ; WIN-X86-NEXT: popl %esi
 ; WIN-X86-NEXT: popl %edi
+; WIN-X86-NEXT: popl %ebx
 ; WIN-X86-NEXT: popl %ebp
 ; WIN-X86-NEXT: retl
 entry:
@@ -2652,23 +3046,28 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tan:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll tanf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: tan:
@@ -2688,17 +3087,20 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _tanl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2734,23 +3136,28 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tanh:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll tanhf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: tanh:
@@ -2770,17 +3177,20 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
-; WIN-X86-NEXT: movl %esp, %eax
-; WIN-X86-NEXT: pushl 24(%ebp)
-; WIN-X86-NEXT: pushl 20(%ebp)
-; WIN-X86-NEXT: pushl 16(%ebp)
-; WIN-X86-NEXT: pushl 12(%ebp)
-; WIN-X86-NEXT: pushl %eax
+; WIN-X86-NEXT: movl 24(%ebp), %eax
+; WIN-X86-NEXT: movl 28(%ebp), %ecx
+; WIN-X86-NEXT: movl 32(%ebp), %edx
+; WIN-X86-NEXT: movl 36(%ebp), %edi
+; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT: movl %eax, (%esp)
 ; WIN-X86-NEXT: calll _tanhl
-; WIN-X86-NEXT: addl $20, %esp
-; WIN-X86-NEXT: movl (%esp), %eax
+; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -2816,23 +3226,28 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: trunc:
 ; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $52, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: calll truncf128
-; X86-NEXT: addl $28, %esp
-; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $52, %esp
 ; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 ;
 ; WIN-LABEL: trunc:
@@ -2852,17 +3267,20 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT: pushl %edi
 ; WIN-X86-NEXT: pushl %esi
 ; WIN-X86-NEXT: andl $-16, %esp
-; WIN-X86-NEXT: subl $16, %esp
+; WIN-X86-NEXT: subl $48, %esp
 ; WIN-X86-NEXT: movl 8(%ebp), %esi
%esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _truncl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2919,12 +3337,18 @@ define i32 @lrint(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: lrint: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _lrintl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %rint = call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -2969,12 +3393,18 @@ define i64 @llrint(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: llrint: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _llrintl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %rint = call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -3019,12 +3449,18 @@ define i32 @lround(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: lround: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _lroundl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %round = call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -3069,12 +3505,18 @@ define i64 @llround(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: llround: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _llroundl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %round = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -3176,26 +3618,32 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; WIN-X86-LABEL: cmp: ; WIN-X86: # %bb.0: -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) +; WIN-X86-NEXT: pushl 36(%ebp) +; WIN-X86-NEXT: pushl 32(%ebp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: je LBB37_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB37_3 ; WIN-X86-NEXT: LBB37_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB37_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %cond = call i1 @llvm.experimental.constrained.fcmp.f128( fp128 %x, fp128 %y, @@ -3300,26 +3748,32 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; WIN-X86-LABEL: cmps: ; WIN-X86: # %bb.0: -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) +; WIN-X86-NEXT: pushl 36(%ebp) +; WIN-X86-NEXT: pushl 32(%ebp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: je LBB38_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB38_3 ; WIN-X86-NEXT: LBB38_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB38_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %cond = call i1 @llvm.experimental.constrained.fcmps.f128( fp128 %x, fp128 %y, @@ -3496,44 +3950,47 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; 
WIN-X86-LABEL: cmp_ueq_q: ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: movl 32(%ebp), %edi +; WIN-X86-NEXT: movl 36(%ebp), %esi +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: sete %bl -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___unordtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: orb %bl, %al ; WIN-X86-NEXT: jne LBB39_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB39_3 ; WIN-X86-NEXT: LBB39_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB39_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi ; WIN-X86-NEXT: popl %ebx @@ -3716,32 +4173,34 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-LABEL: cmp_one_q: ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: movl 32(%ebp), %edi +; WIN-X86-NEXT: movl 36(%ebp), %esi +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: setne %bl -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: 
pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___unordtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax @@ -3749,13 +4208,14 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-NEXT: testb %bl, %al ; WIN-X86-NEXT: jne LBB40_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB40_3 ; WIN-X86-NEXT: LBB40_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB40_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi ; WIN-X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll index f727a79..4b0449f 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -42,22 +42,38 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Add: @@ -78,22 +94,31 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; 
WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -101,8 +126,10 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -144,22 +171,38 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Add: @@ -180,22 +223,31 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; 
WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -203,8 +255,10 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -241,22 +295,38 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sub: @@ -277,22 +347,31 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl 
%esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -300,8 +379,10 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -343,22 +424,38 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Sub: @@ -379,22 +476,31 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; 
WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -402,8 +508,10 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -440,22 +548,38 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; 
X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Mul: @@ -476,22 +600,31 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -499,8 +632,10 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -542,22 +677,38 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), 
%xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Mul: @@ -578,22 +729,31 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -601,8 +761,10 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -639,22 +801,38 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: 
pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Div: @@ -675,22 +853,31 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -698,8 +885,10 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -741,22 +930,38 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Div: @@ -777,22 +982,31 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -800,8 +1014,10 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -830,22 +1046,38 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 
%ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rem: @@ -866,22 +1098,31 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -889,8 +1130,10 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -922,22 +1165,38 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 
%edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Rem: @@ -958,22 +1217,31 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -981,8 +1249,10 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1011,18 +1281,24 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sqrt: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sqrtf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sqrt: @@ -1042,16 +1318,19 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sqrtl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1089,18 +1368,24 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sin: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sin: @@ -1120,16 +1405,19 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1167,18 +1455,24 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Cos: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll cosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Cos: @@ -1198,16 +1492,19 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _cosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1245,18 +1542,24 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Ceil: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll ceilf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; 
X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Ceil: @@ -1276,16 +1579,19 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _ceill -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1323,18 +1629,24 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Floor: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll floorf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Floor: @@ -1354,16 +1666,19 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _floorl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1401,18 +1716,24 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; ; 
X86-LABEL: Test128Trunc: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll truncf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Trunc: @@ -1432,16 +1753,19 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _truncl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1479,18 +1803,24 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Nearbyint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll nearbyintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Nearbyint: @@ -1510,16 +1840,19 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; 
WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _nearbyintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1557,18 +1890,24 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Rint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll rintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rint: @@ -1588,16 +1927,19 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _rintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1635,18 +1977,24 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Round: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 
X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll roundf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Round: @@ -1666,16 +2014,19 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1705,31 +2056,48 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; ; X86-LABEL: Test128FMA: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $92, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll 
fmaf128 -; X86-NEXT: addl $60, %esp -; X86-NEXT: movaps (%esp), %xmm0 -; X86-NEXT: movaps %xmm0, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%ebp) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl $92, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128FMA: @@ -1752,28 +2120,40 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $96, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 56(%ebp) -; WIN-X86-NEXT: pushl 52(%ebp) -; WIN-X86-NEXT: pushl 48(%ebp) -; WIN-X86-NEXT: pushl 44(%ebp) -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 52(%ebp), %ebx +; WIN-X86-NEXT: movl 56(%ebp), %edi +; WIN-X86-NEXT: movl 60(%ebp), %edx +; WIN-X86-NEXT: movl 64(%ebp), %ecx +; WIN-X86-NEXT: movl 68(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 48(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 44(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 40(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmal -; WIN-X86-NEXT: addl $52, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1782,9 +2162,10 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1804,23 +2185,28 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; ; X86-LABEL: Test128Acos: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll acosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Acos: @@ -1840,17 +2226,20 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _acosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1879,23 +2268,28 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; ; X86-LABEL: Test128Asin: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll asinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Asin: @@ -1915,17 +2309,20 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; 
WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _asinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1954,23 +2351,28 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; ; X86-LABEL: Test128Atan: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Atan: @@ -1990,17 +2392,20 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2029,27 +2434,40 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; ; X86-LABEL: Test128Atan2: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atan2f128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Atan2: @@ -2069,24 +2487,32 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atan2l -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2095,9 +2521,10 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b) @@ -2115,23 +2542,28 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Cosh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: 
subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll coshf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Cosh: @@ -2151,17 +2583,20 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _coshl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2190,23 +2625,28 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Sinh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Sinh: @@ -2226,17 +2666,20 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp 
-; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2265,23 +2708,28 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; ; X86-LABEL: Test128Tan: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Tan: @@ -2301,17 +2749,20 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2340,23 +2791,28 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Tanh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Tanh: @@ -2376,17 +2832,20 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2425,27 +2884,34 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind { ; ; X86-LABEL: Test128Modf: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll modff128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: movaps %xmm1, 16(%esi) ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $40, %esp +; X86-NEXT: addl $80, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; 
X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Modf: @@ -2468,18 +2934,21 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: subl $112, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; WIN-X86-NEXT: pushl %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %ecx +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _modfl -; WIN-X86-NEXT: addl $24, %esp ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 0f66d42..953a5e7 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: vmovdqa (%ecx), %xmm0 -; X86-NEXT: vpand (%edx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%edx), %xmm0 +; X86-NEXT: vpand (%ecx), %xmm0, %xmm0 ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: freeze_extractelement: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rsi), %xmm0 -; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rdi), %xmm0 +; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq %i0 = load <16 x i8>, ptr %origin0 @@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: vmovdqa (%edx), %xmm0 -; X86-NEXT: vpand (%esi), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%esi), %xmm0 +; X86-NEXT: vpand (%edx), %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: popl %esi @@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; ; X64-LABEL: freeze_extractelement_escape: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rsi), %xmm0 -; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rdi), %xmm0 +; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq @@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X86-NEXT: movl 32(%ebp), %edx ; X86-NEXT: movl 12(%ebp), %esi ; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: vmovaps (%esi), %xmm0 -; X86-NEXT: vandps (%edi), %xmm0, %xmm0 +; X86-NEXT: vmovaps (%edi), %xmm0 +; X86-NEXT: vandps (%esi), %xmm0, %xmm0 ; X86-NEXT: vmovaps %xmm0, (%esp) ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: cmpb (%esp,%eax), %cl @@ -255,8 +255,8 @@ 
define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X64: # %bb.0: ; X64-NEXT: andl $15, %ecx ; X64-NEXT: andl $15, %edx -; X64-NEXT: vmovaps (%rsi), %xmm0 -; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vandps (%rsi), %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -24(%rsp,%rdx), %eax ; X64-NEXT: cmpb -24(%rsp,%rcx), %al diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index e8c8ccf..ec1b8a3 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -264,53 +264,62 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-LABEL: var_shift_i128: ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: movl %esp, %ebp ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: andl $-16, %esp +; X86-FAST-NEXT: subl $16, %esp +; X86-FAST-NEXT: movl 24(%ebp), %edi +; X86-FAST-NEXT: movl 28(%ebp), %edx +; X86-FAST-NEXT: movl 48(%ebp), %esi +; X86-FAST-NEXT: movl 56(%ebp), %ecx ; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: movl 52(%ebp), %eax ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %ebx, %ebp ; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %edx, %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movl 32(%ebp), %edi +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %edx, %eax +; X86-FAST-NEXT: movl 36(%ebp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: movl %ebx, %edi -; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-FAST-NEXT: jmp .LBB6_6 ; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl 44(%ebp), %ebx +; X86-FAST-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl 40(%ebp), %ebx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_4 ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: movl %eax, %ebp +; X86-FAST-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-FAST-NEXT: .LBB6_6: -; X86-FAST-NEXT: movl %ebx, %eax -; X86-FAST-NEXT: shldl %cl, %ebp, %eax -; X86-FAST-NEXT: movl %edi, %ebp -; X86-FAST-NEXT: shldl %cl, %ebx, %ebp -; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: shldl %cl, %edi, %ebx +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: shldl %cl, %ebx, %edi +; X86-FAST-NEXT: movl %eax, %edx +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: shldl %cl, %esi, %ebx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: movl %eax, %esi +; X86-FAST-NEXT: shldl %cl, %edx, %esi ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; 
X86-FAST-NEXT: shldl %cl, %esi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl %edx, 12(%ecx) -; X86-FAST-NEXT: movl %ebx, 8(%ecx) -; X86-FAST-NEXT: movl %ebp, 4(%ecx) -; X86-FAST-NEXT: movl %eax, (%ecx) -; X86-FAST-NEXT: movl %ecx, %eax +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: shldl %cl, %eax, %edx +; X86-FAST-NEXT: movl 8(%ebp), %eax +; X86-FAST-NEXT: movl %edx, 12(%eax) +; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: movl %ebx, 4(%eax) +; X86-FAST-NEXT: movl %edi, (%eax) +; X86-FAST-NEXT: leal -12(%ebp), %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -320,77 +329,91 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-LABEL: var_shift_i128: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: movl %esp, %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: testb $64, %al +; X86-SLOW-NEXT: andl $-16, %esp +; X86-SLOW-NEXT: subl $32, %esp +; X86-SLOW-NEXT: movl 24(%ebp), %esi +; X86-SLOW-NEXT: movl 28(%ebp), %eax +; X86-SLOW-NEXT: movl 48(%ebp), %edx +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: movl 52(%ebp), %edi ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %ebx, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: movl 32(%ebp), %esi ; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: testb $32, %al -; X86-SLOW-NEXT: je .LBB6_5 -; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %edi, %ebx -; X86-SLOW-NEXT: movl %edx, %edi -; X86-SLOW-NEXT: movl %ecx, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: movl %eax, %edi +; X86-SLOW-NEXT: movl 36(%ebp), %eax +; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: movl 40(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %ecx +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl 56(%ebp), %ebx +; X86-SLOW-NEXT: testb $32, %bl ; X86-SLOW-NEXT: jne .LBB6_4 -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ecx, %ebp -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: ; 
X86-SLOW-NEXT: movl %edx, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %ebp -; X86-SLOW-NEXT: movb %al, %ch -; X86-SLOW-NEXT: notb %ch -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: movl %edi, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %edx -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %ebx, %edi ; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: notb %bl +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %edx +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: orl %eax, %edx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %ebx -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-SLOW-NEXT: shrl %cl, %ebx ; X86-SLOW-NEXT: orl %eax, %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl 8(%ebp), %eax ; X86-SLOW-NEXT: movl %ebx, 12(%eax) -; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %esi, 8(%eax) ; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %ebp, (%eax) -; X86-SLOW-NEXT: addl $4, %esp +; X86-SLOW-NEXT: movl %edi, (%eax) +; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 4340f8f..544ab7f 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -258,51 +258,53 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-LABEL: var_shift_i128: ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: movl %esp, %ebp ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: pushl %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: andl $-16, %esp +; X86-FAST-NEXT: subl $16, %esp +; X86-FAST-NEXT: movl 24(%ebp), %esi +; 
X86-FAST-NEXT: movl 28(%ebp), %eax +; X86-FAST-NEXT: movl 48(%ebp), %edx +; X86-FAST-NEXT: movl 56(%ebp), %ecx ; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: movl 52(%ebp), %ebx ; X86-FAST-NEXT: je .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl %esi, %ebp -; X86-FAST-NEXT: movl %ebx, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl 32(%ebp), %esi +; X86-FAST-NEXT: movl %ebx, %edi +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: movl 36(%ebp), %eax ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_4 ; X86-FAST-NEXT: jmp .LBB6_5 ; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl 40(%ebp), %edi +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl 44(%ebp), %edi ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edi, %ebx -; X86-FAST-NEXT: movl %esi, %edi -; X86-FAST-NEXT: movl %edx, %esi -; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: movl %ebx, %esi +; X86-FAST-NEXT: movl %edx, %ebx +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: shrdl %cl, %edx, %ebp -; X86-FAST-NEXT: shrdl %cl, %esi, %edx -; X86-FAST-NEXT: shrdl %cl, %edi, %esi +; X86-FAST-NEXT: shrdl %cl, %edx, %edi +; X86-FAST-NEXT: shrdl %cl, %ebx, %edx +; X86-FAST-NEXT: shrdl %cl, %esi, %ebx ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shrdl %cl, %ebx, %edi -; X86-FAST-NEXT: movl %edi, 12(%eax) -; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: shrdl %cl, %eax, %esi +; X86-FAST-NEXT: movl 8(%ebp), %eax +; X86-FAST-NEXT: movl %esi, 12(%eax) +; X86-FAST-NEXT: movl %ebx, 8(%eax) ; X86-FAST-NEXT: movl %edx, 4(%eax) -; X86-FAST-NEXT: movl %ebp, (%eax) -; X86-FAST-NEXT: addl $4, %esp +; X86-FAST-NEXT: movl %edi, (%eax) +; X86-FAST-NEXT: leal -12(%ebp), %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -312,78 +314,88 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-LABEL: var_shift_i128: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: movl %esp, %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $8, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: andl $-16, %esp +; X86-SLOW-NEXT: subl $16, %esp +; X86-SLOW-NEXT: movl 24(%ebp), %edx +; X86-SLOW-NEXT: movl 28(%ebp), %esi +; X86-SLOW-NEXT: movl 48(%ebp), %ebx +; X86-SLOW-NEXT: movl 56(%ebp), %eax +; X86-SLOW-NEXT: testb $64, %al +; X86-SLOW-NEXT: movl 52(%ebp), %edi ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %ebx, %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: 
movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl 32(%ebp), %edx +; X86-SLOW-NEXT: movl %edi, %eax ; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: testb $32, %cl -; X86-SLOW-NEXT: jne .LBB6_5 -; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, %edi -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: movl 36(%ebp), %esi +; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl 40(%ebp), %eax +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %eax +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: je .LBB6_4 -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ebx +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: leal (%ebp,%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: orl %edx, %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: notb %al +; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: addl %ebx, %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl %edx, %ebx +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: leal (%edi,%edi), %edx -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %ebp, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-SLOW-NEXT: leal (%edi,%edi), %ebp -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: leal (%edi,%edi), %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-SLOW-NEXT: 
movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl 8(%ebp), %ecx ; X86-SLOW-NEXT: movl %esi, 12(%ecx) -; X86-SLOW-NEXT: movl %ebp, 8(%ecx) +; X86-SLOW-NEXT: movl %ebx, 8(%ecx) ; X86-SLOW-NEXT: movl %edx, 4(%ecx) +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: movl %eax, (%ecx) ; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/function-align.ll b/llvm/test/CodeGen/X86/function-align.ll new file mode 100644 index 0000000..11d0e99 --- /dev/null +++ b/llvm/test/CodeGen/X86/function-align.ll @@ -0,0 +1,18 @@ +; RUN: llc -function-sections < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: .section .text.f1 +; CHECK-NOT: .p2align +; CHECK: f1: +define void @f1() align 1 { + ret void +} + +; CHECK: .section .text.f2 +; CHECK-NEXT: .globl f2 +; CHECK-NEXT: .p2align 1 +define void @f2() align 2 { + ret void +} diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index a464d78..df97f49 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -74,43 +74,57 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SSE2-LABEL: fshl_i128: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 48(%ebp), %edi +; X86-SSE2-NEXT: movl 52(%ebp), %eax +; X86-SSE2-NEXT: movl 24(%ebp), %edx +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: testb $64, %cl -; X86-SSE2-NEXT: movl %esi, %eax -; X86-SSE2-NEXT: cmovnel %ebx, %eax -; X86-SSE2-NEXT: movl %edx, %ebp -; X86-SSE2-NEXT: cmovnel %edi, %ebp -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: movl %edx, %ecx +; X86-SSE2-NEXT: cmovnel %edi, %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%ebp), %esi +; X86-SSE2-NEXT: movl %esi, %ebx +; X86-SSE2-NEXT: cmovnel %eax, %ebx +; X86-SSE2-NEXT: cmovnel 44(%ebp), %eax +; X86-SSE2-NEXT: cmovnel 40(%ebp), %edi +; X86-SSE2-NEXT: cmovel 36(%ebp), %esi +; X86-SSE2-NEXT: cmovel 32(%ebp), %edx +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: testb $32, %cl -; X86-SSE2-NEXT: cmovnel %esi, %edx -; X86-SSE2-NEXT: cmovnel %ebp, %esi -; X86-SSE2-NEXT: cmovnel %eax, %ebp -; X86-SSE2-NEXT: cmovel %edi, %ebx +; X86-SSE2-NEXT: cmovnel %edx, %esi +; X86-SSE2-NEXT: cmovnel %ebx, %edx +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: cmovnel %ecx, %ebx ; X86-SSE2-NEXT: cmovel %eax, %edi -; X86-SSE2-NEXT: movl 
%edi, %eax -; X86-SSE2-NEXT: shldl %cl, %ebx, %eax -; X86-SSE2-NEXT: movl %ebp, %ebx -; X86-SSE2-NEXT: shldl %cl, %edi, %ebx -; X86-SSE2-NEXT: movl %esi, %edi -; X86-SSE2-NEXT: shldl %cl, %ebp, %edi +; X86-SSE2-NEXT: cmovel %ecx, %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SSE2-NEXT: shldl %cl, %esi, %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl %edx, 12(%ecx) -; X86-SSE2-NEXT: movl %edi, 8(%ecx) -; X86-SSE2-NEXT: movl %ebx, 4(%ecx) -; X86-SSE2-NEXT: movl %eax, (%ecx) -; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SSE2-NEXT: movl %ebx, %edi +; X86-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: shldl %cl, %eax, %edi +; X86-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl %edx, %edi +; X86-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-SSE2-NEXT: shldl %cl, %ebx, %edi +; X86-SSE2-NEXT: movl 8(%ebp), %eax +; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-SSE2-NEXT: movl %esi, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: leal -12(%ebp), %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll index 2849e44..b4546c1 100644 --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -5,17 +5,20 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-LABEL: add_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: addl 40(%ebp), %esi +; X86-NEXT: adcl 44(%ebp), %edi +; X86-NEXT: adcl 48(%ebp), %ecx +; X86-NEXT: adcl 52(%ebp), %edx ; X86-NEXT: addl $1, %esi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ecx @@ -24,8 +27,10 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: add_i128: diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll index 4152dcf..2174d50 100644 --- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll +++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll @@ -55,41 +55,47 @@ define void @store(PrimTy %x, ptr %p) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 
16(%esp), %ecx -; CHECK-X86-NEXT: movl 20(%esp), %edx -; CHECK-X86-NEXT: movl 24(%esp), %esi -; CHECK-X86-NEXT: movl 28(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 20(%esp), %ecx +; CHECK-X86-NEXT: movl 24(%esp), %edx +; CHECK-X86-NEXT: movl 28(%esp), %esi +; CHECK-X86-NEXT: movl 32(%esp), %edi ; CHECK-X86-NEXT: movl %esi, 12(%edi) ; CHECK-X86-NEXT: movl %edx, 8(%edi) ; CHECK-X86-NEXT: movl %ecx, 4(%edi) ; CHECK-X86-NEXT: movl %eax, (%edi) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: store: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 16(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 20(%esp), %edx -; CHECK-MSVC32-NEXT: movl 24(%esp), %esi -; CHECK-MSVC32-NEXT: movl 28(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 24(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %esi, 12(%edi) ; CHECK-MSVC32-NEXT: movl %edx, 8(%edi) ; CHECK-MSVC32-NEXT: movl %ecx, 4(%edi) ; CHECK-MSVC32-NEXT: movl %eax, (%edi) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl store PrimTy %x, ptr %p ret void } ; Illustrate stack alignment -; FIXME(#77401): alignment on x86-32 is ABI-incorrect. define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind { ; CHECK-X64-F128-LABEL: store_perturbed: ; CHECK-X64-F128: # %bb.0: @@ -130,34 +136,41 @@ define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 16(%esp), %eax -; CHECK-X86-NEXT: movl 20(%esp), %ecx -; CHECK-X86-NEXT: movl 24(%esp), %edx -; CHECK-X86-NEXT: movl 28(%esp), %esi -; CHECK-X86-NEXT: movl 32(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 32(%esp), %eax +; CHECK-X86-NEXT: movl 36(%esp), %ecx +; CHECK-X86-NEXT: movl 40(%esp), %edx +; CHECK-X86-NEXT: movl 44(%esp), %esi +; CHECK-X86-NEXT: movl 48(%esp), %edi ; CHECK-X86-NEXT: movl %esi, 12(%edi) ; CHECK-X86-NEXT: movl %edx, 8(%edi) ; CHECK-X86-NEXT: movl %ecx, 4(%edi) ; CHECK-X86-NEXT: movl %eax, (%edi) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: store_perturbed: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 16(%esp), %eax -; CHECK-MSVC32-NEXT: movl 20(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 24(%esp), %edx -; CHECK-MSVC32-NEXT: movl 28(%esp), %esi -; CHECK-MSVC32-NEXT: movl 32(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 24(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 28(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 32(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 36(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 40(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %esi, 12(%edi) ; CHECK-MSVC32-NEXT: movl %edx, 8(%edi) ; CHECK-MSVC32-NEXT: movl %ecx, 4(%edi) ; CHECK-MSVC32-NEXT: movl %eax, (%edi) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; 
CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl store PrimTy %x, ptr %p ret void @@ -271,34 +284,41 @@ define PrimTy @first_arg(PrimTy %x) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 16(%esp), %ecx -; CHECK-X86-NEXT: movl 20(%esp), %edx -; CHECK-X86-NEXT: movl 24(%esp), %esi -; CHECK-X86-NEXT: movl 28(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 32(%esp), %ecx +; CHECK-X86-NEXT: movl 36(%esp), %edx +; CHECK-X86-NEXT: movl 40(%esp), %esi +; CHECK-X86-NEXT: movl 44(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: first_arg: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 16(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 20(%esp), %edx -; CHECK-MSVC32-NEXT: movl 24(%esp), %esi -; CHECK-MSVC32-NEXT: movl 28(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 24(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 28(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 32(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 36(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -344,34 +364,41 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 48(%esp), %ecx -; CHECK-X86-NEXT: movl 52(%esp), %edx -; CHECK-X86-NEXT: movl 56(%esp), %esi -; CHECK-X86-NEXT: movl 60(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 64(%esp), %ecx +; CHECK-X86-NEXT: movl 68(%esp), %edx +; CHECK-X86-NEXT: movl 72(%esp), %esi +; CHECK-X86-NEXT: movl 76(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: leading_args: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 48(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 52(%esp), %edx -; CHECK-MSVC32-NEXT: movl 56(%esp), %esi -; CHECK-MSVC32-NEXT: movl 60(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 56(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 60(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 64(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 68(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; 
CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -417,34 +444,41 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 72(%esp), %ecx -; CHECK-X86-NEXT: movl 76(%esp), %edx -; CHECK-X86-NEXT: movl 80(%esp), %esi -; CHECK-X86-NEXT: movl 84(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 80(%esp), %ecx +; CHECK-X86-NEXT: movl 84(%esp), %edx +; CHECK-X86-NEXT: movl 88(%esp), %esi +; CHECK-X86-NEXT: movl 92(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: many_leading_args: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 72(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 76(%esp), %edx -; CHECK-MSVC32-NEXT: movl 80(%esp), %esi -; CHECK-MSVC32-NEXT: movl 84(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 72(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 76(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 80(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 84(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -488,34 +522,41 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 56(%esp), %ecx -; CHECK-X86-NEXT: movl 60(%esp), %edx -; CHECK-X86-NEXT: movl 64(%esp), %esi -; CHECK-X86-NEXT: movl 68(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 64(%esp), %ecx +; CHECK-X86-NEXT: movl 68(%esp), %edx +; CHECK-X86-NEXT: movl 72(%esp), %esi +; CHECK-X86-NEXT: movl 76(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: trailing_arg: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 56(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 60(%esp), %edx -; CHECK-MSVC32-NEXT: movl 64(%esp), %esi -; CHECK-MSVC32-NEXT: movl 68(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 56(%ebp), %ecx 
+; CHECK-MSVC32-NEXT: movl 60(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 64(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 68(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -571,32 +612,43 @@ define void @call_first_arg(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_first_arg: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $56, %esp +; CHECK-X86-NEXT: movl 64(%esp), %eax +; CHECK-X86-NEXT: movl 68(%esp), %ecx +; CHECK-X86-NEXT: movl 72(%esp), %edx +; CHECK-X86-NEXT: movl 76(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 28(%esp) +; CHECK-X86-NEXT: movl %edx, 24(%esp) +; CHECK-X86-NEXT: movl %ecx, 20(%esp) +; CHECK-X86-NEXT: movl %eax, 16(%esp) +; CHECK-X86-NEXT: leal 32(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) ; CHECK-X86-NEXT: calll first_arg@PLT -; CHECK-X86-NEXT: addl $56, %esp +; CHECK-X86-NEXT: addl $52, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_first_arg: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $64, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 28(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 24(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 20(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 16(%esp) +; CHECK-MSVC32-NEXT: leal 32(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) ; CHECK-MSVC32-NEXT: calll _first_arg -; CHECK-MSVC32-NEXT: addl $20, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @first_arg(PrimTy %x) @@ -686,48 +738,59 @@ define void @call_leading_args(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_leading_args: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $88, %esp +; CHECK-X86-NEXT: movl 96(%esp), %eax +; CHECK-X86-NEXT: movl 100(%esp), %ecx +; CHECK-X86-NEXT: movl 104(%esp), %edx +; CHECK-X86-NEXT: movl 108(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 60(%esp) +; CHECK-X86-NEXT: movl %edx, 56(%esp) +; CHECK-X86-NEXT: 
movl %ecx, 52(%esp) +; CHECK-X86-NEXT: movl %eax, 48(%esp) +; CHECK-X86-NEXT: leal 64(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll leading_args@PLT -; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: addl $84, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_leading_args: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $96, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 60(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 56(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 52(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 48(%esp) +; CHECK-MSVC32-NEXT: leal 64(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _leading_args -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x) @@ -836,56 +899,67 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_many_leading_args: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $104, %esp +; CHECK-X86-NEXT: movl 112(%esp), %eax +; CHECK-X86-NEXT: movl 116(%esp), %ecx +; CHECK-X86-NEXT: movl 120(%esp), %edx +; CHECK-X86-NEXT: movl 124(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 76(%esp) +; CHECK-X86-NEXT: movl %edx, 72(%esp) +; CHECK-X86-NEXT: movl %ecx, 68(%esp) +; CHECK-X86-NEXT: movl %eax, 64(%esp) +; CHECK-X86-NEXT: leal 80(%esp), %eax +; CHECK-X86-NEXT: movl 
%eax, (%esp) +; CHECK-X86-NEXT: movl $0, 60(%esp) +; CHECK-X86-NEXT: movl $0, 56(%esp) +; CHECK-X86-NEXT: movl $0, 52(%esp) +; CHECK-X86-NEXT: movl $0, 48(%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll many_leading_args@PLT -; CHECK-X86-NEXT: addl $104, %esp +; CHECK-X86-NEXT: addl $100, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_many_leading_args: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $112, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 76(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 72(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 68(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 64(%esp) +; CHECK-MSVC32-NEXT: leal 80(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 60(%esp) +; CHECK-MSVC32-NEXT: movl $0, 56(%esp) +; CHECK-MSVC32-NEXT: movl $0, 52(%esp) +; CHECK-MSVC32-NEXT: movl $0, 48(%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _many_leading_args -; CHECK-MSVC32-NEXT: addl $68, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x) @@ -975,48 +1049,59 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_trailing_arg: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $88, %esp +; CHECK-X86-NEXT: movl 96(%esp), %eax +; CHECK-X86-NEXT: movl 100(%esp), %ecx +; CHECK-X86-NEXT: movl 104(%esp), %edx +; CHECK-X86-NEXT: movl 108(%esp), %esi 
+; CHECK-X86-NEXT: movl %esi, 60(%esp) +; CHECK-X86-NEXT: movl %edx, 56(%esp) +; CHECK-X86-NEXT: movl %ecx, 52(%esp) +; CHECK-X86-NEXT: movl %eax, 48(%esp) +; CHECK-X86-NEXT: leal 64(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll trailing_arg@PLT -; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: addl $84, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_trailing_arg: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $96, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 60(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 56(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 52(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 48(%esp) +; CHECK-MSVC32-NEXT: leal 64(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _trailing_arg -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x) diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll index 717f52f..7d57573 100644 --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -8,18 +8,21 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl $30, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl 28(%ebp), %esi +; X86-NEXT: adcl 32(%ebp), %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: shrdl $2, %ecx, %edx ; X86-NEXT: movl %ecx, %esi @@ -29,8 +32,10 @@ define i128 @test1(i128 %x) nounwind { ; 
X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: @@ -52,38 +57,44 @@ define i128 @test1(i128 %x) nounwind { define i128 @test2(i128 %x) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shrl $30, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: shrdl $2, %edx, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl 28(%ebp), %edx +; X86-NEXT: adcl 32(%ebp), %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: shrdl $2, %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: sarl $2, %edx -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: sarl $2, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test2: diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll index 3f890b7..9011832 100644 --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -8,15 +8,21 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx ; X86-NEXT: shrdl $2, %edx, %ecx ; X86-NEXT: shrl $2, %edx ; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll index 55c318e..bdceeef 100644 --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -123,31 +123,34 @@ define i64 @test_i64(i64 %a) nounwind { define i128 @test_i128(i128 %a) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128: diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll index c52b3ed..4a6c1d0 100644 --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -10,33 +10,39 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind { ; X86-LABEL: opt_setcc_lt_power_of_2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %esi ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl $1, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: shrdl $28, %ebx, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -73,15 +79,21 @@ exit: define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 
16(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: orl 20(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: sete %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: @@ -98,15 +110,21 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: orl 20(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: setne %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: @@ -123,13 +141,19 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl 8(%ebp), %eax +; X86-NEXT: orl 12(%ebp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: @@ -146,13 +170,19 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl 8(%ebp), %eax +; X86-NEXT: orl 12(%ebp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: setne %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: @@ -170,13 +200,17 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi ; X86-NEXT: shldl $17, %edx, %esi ; X86-NEXT: shldl $17, %ecx, %edx ; X86-NEXT: shldl $17, %eax, %ecx @@ -194,9 +228,11 @@ define i1 
@opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-NEXT: calll use@PLT ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users: diff --git a/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll b/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll index ac4963f..17065a4 100644 --- a/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll +++ b/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll @@ -1,10 +1,10 @@ ; RUN: not llc -mtriple=x86_64-unknown-linux-gnu < %s 2>&1 | FileCheck %s -; CHECK: LLVM ERROR: cannot lower calls with arbitrary operand bundles: foo +; CHECK: LLVM ERROR: cannot lower calls with arbitrary operand bundles: foo, bar, baz declare void @g() define void @f(i32 %arg) { - call void @g() [ "foo"(i32 %arg) ] + call void @g() [ "foo"(i32 %arg), "bar"(i32 %arg), "baz"(i32 %arg) ] ret void } diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll new file mode 100644 index 0000000..960bbf5 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -0,0 +1,526 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86-SDAGISEL +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL +; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL +; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL + +; FIXME: We can reuse/delete llvm/test/CodeGen/X86/is_fpclass.ll when all patches are included. 
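+; For reference when reading the per-test mask comments below (this summarizes
+; the LangRef encoding of the llvm.is.fpclass test mask): the i32 operand is a
+; bitwise OR of class bits 1 (snan), 2 (qnan), 4 (-inf), 8 (-normal),
+; 16 (-subnormal), 32 (-zero), 64 (+zero), 128 (+subnormal), 256 (+normal)
+; and 512 (+inf); e.g. 516 = 0x204 = 512|4 matches any infinity, and
+; 504 = 0x1f8 covers the six finite classes.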
+ +define i1 @isnone_f(float %x) { +; X86-SDAGISEL-LABEL: isnone_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: xorl %eax, %eax +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isnone_f: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isnone_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: xorl %eax, %eax +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) + ret i1 %0 +} + +define i1 @isany_f(float %x) { +; X86-SDAGISEL-LABEL: isany_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movb $1, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isany_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movb $1, %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isany_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: movb $1, %al +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) + ret i1 %0 +} + +define i1 @issignaling_f(float %x) { +; X86-SDAGISEL-LABEL: issignaling_f: +; X86-SDAGISEL: # %bb.0: +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setl %cl +; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: andb %cl, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: issignaling_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setl %cl +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: issignaling_f: +; X86-FASTISEL: # %bb.0: +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-FASTISEL-NEXT: setl %cl +; X86-FASTISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-FASTISEL-NEXT: setge %al +; X86-FASTISEL-NEXT: andb %cl, %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl + %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" + ret i1 %a0 +} + + define i1 @isquiet_f(float %x) { +; X86-SDAGISEL-LABEL: isquiet_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isquiet_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isquiet_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: 
andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-FASTISEL-NEXT: setge %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl + entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" + ret i1 %0 +} + +define i1 @not_isquiet_f(float %x) { +; X86-SDAGISEL-LABEL: not_isquiet_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setl %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_isquiet_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setl %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_isquiet_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-FASTISEL-NEXT: setl %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" + ret i1 %0 +} + +define i1 @isinf_f(float %x) { +; X86-SDAGISEL-LABEL: isinf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isinf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isinf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: sete %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" + ret i1 %0 +} + +define i1 @not_isinf_f(float %x) { +; X86-SDAGISEL-LABEL: not_isinf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setne %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_isinf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_isinf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: 
movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setne %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" + ret i1 %0 +} + +define i1 @is_plus_inf_f(float %x) { +; X86-SDAGISEL-LABEL: is_plus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: is_plus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: is_plus_inf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000 +; X86-FASTISEL-NEXT: sete %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" + ret i1 %0 +} + +define i1 @is_minus_inf_f(float %x) { +; X86-SDAGISEL-LABEL: is_minus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: is_minus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: is_minus_inf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000 +; X86-FASTISEL-NEXT: sete %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" + ret i1 %0 +} + +define i1 @not_is_minus_inf_f(float %x) { +; X86-SDAGISEL-LABEL: not_is_minus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-SDAGISEL-NEXT: setne %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_is_minus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: setne %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_is_minus_inf_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000 +; X86-FASTISEL-NEXT: setne %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" + ret i1 %0 +} + +define i1 @isfinite_f(float %x) { +; X86-SDAGISEL-LABEL: isfinite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; 
X86-SDAGISEL-NEXT: setl %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isfinite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isfinite_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setl %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" + ret i1 %0 +} + +define i1 @not_isfinite_f(float %x) { +; X86-SDAGISEL-LABEL: not_isfinite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: not_isfinite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setge %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: not_isfinite_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-FASTISEL-NEXT: andl (%esp), %eax +; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setge %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" + ret i1 %0 +} + +define i1 @is_plus_finite_f(float %x) { +; X86-SDAGISEL-LABEL: is_plus_finite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setb %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: is_plus_finite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setb %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: is_plus_finite_f: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: pushl %eax +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstps (%esp) +; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000 +; X86-FASTISEL-NEXT: setb %al +; X86-FASTISEL-NEXT: popl %ecx +; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" + ret i1 %0 +} + +define i1 @isnone_d(double %x) nounwind { +; X86-SDAGISEL-LABEL: isnone_d: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: xorl %eax, %eax +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isnone_d: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isnone_d: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldl {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp 
%st(0) +; X86-FASTISEL-NEXT: xorl %eax, %eax +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0) + ret i1 %0 +} + +define i1 @isany_d(double %x) nounwind { +; X86-SDAGISEL-LABEL: isany_d: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movb $1, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-LABEL: isany_d: +; X64: # %bb.0: # %entry +; X64-NEXT: movb $1, %al +; X64-NEXT: retq +; +; X86-FASTISEL-LABEL: isany_d: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldl {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: movb $1, %al +; X86-FASTISEL-NEXT: retl +entry: + %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023) + ret i1 %0 +} + +define i1 @isnone_f80(x86_fp80 %x) nounwind { +; X86-SDAGISEL-LABEL: isnone_f80: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: xorl %eax, %eax +; X86-SDAGISEL-NEXT: retl +; +; X64-SDAGISEL-LABEL: isnone_f80: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: xorl %eax, %eax +; X64-SDAGISEL-NEXT: retq +; +; X86-FASTISEL-LABEL: isnone_f80: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: xorl %eax, %eax +; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isnone_f80: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-FASTISEL-NEXT: fstp %st(0) +; X64-FASTISEL-NEXT: xorl %eax, %eax +; X64-FASTISEL-NEXT: retq +entry: +%0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 0) +ret i1 %0 +} + +define i1 @isany_f80(x86_fp80 %x) nounwind { +; X86-SDAGISEL-LABEL: isany_f80: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movb $1, %al +; X86-SDAGISEL-NEXT: retl +; +; X64-SDAGISEL-LABEL: isany_f80: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movb $1, %al +; X64-SDAGISEL-NEXT: retq +; +; X86-FASTISEL-LABEL: isany_f80: +; X86-FASTISEL: # %bb.0: # %entry +; X86-FASTISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-FASTISEL-NEXT: fstp %st(0) +; X86-FASTISEL-NEXT: movb $1, %al +; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isany_f80: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-FASTISEL-NEXT: fstp %st(0) +; X64-FASTISEL-NEXT: movb $1, %al +; X64-FASTISEL-NEXT: retq +entry: + %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 1023) + ret i1 %0 +} diff --git a/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir new file mode 100644 index 0000000..e272e7e --- /dev/null +++ b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir @@ -0,0 +1,128 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s +# +# Check that only the computed gotos are duplicated aggressively. +# +--- | + @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)] + declare i64 @f0() + declare i64 @f1() + declare i64 @f2() + declare i64 @f3() + declare i64 @f4() + declare i64 @f5() + define void @computed_goto() { + start: + ret void + bb1: + ret void + bb2: + ret void + bb3: + ret void + bb4: + ret void + } + define void @jump_table() { ret void } + define void @jump_table_pic() { ret void } +... 
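+# Per the intent stated above: the shared dispatch block ends in an indirect
+# JMP64m through @computed_goto.dispatch and has five predecessors and four
+# successors, exceeding both -tail-dup-pred-size=1 and -tail-dup-succ-size=1,
+# so the CHECK lines rely on the computed-goto exemption for the indirect
+# branch to be cloned into every predecessor.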
+--- +name: computed_goto +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: computed_goto + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4): + ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]] + ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg + bb.0: + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %0:gr64 = COPY $rax + %6:gr64_nosp = COPY %0 + JMP_1 %bb.5 + + bb.1.bb1 (ir-block-address-taken %ir-block.bb1): + successors: %bb.5(0x80000000) + + 
CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %1:gr64 = COPY $rax + %6:gr64_nosp = COPY %1 + JMP_1 %bb.5 + + bb.2.bb2 (ir-block-address-taken %ir-block.bb2): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %2:gr64 = COPY $rax + %6:gr64_nosp = COPY %2 + JMP_1 %bb.5 + + bb.3.bb3 (ir-block-address-taken %ir-block.bb3): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %3:gr64 = COPY $rax + %6:gr64_nosp = COPY %3 + JMP_1 %bb.5 + + bb.4.bb4 (ir-block-address-taken %ir-block.bb4): + successors: %bb.5(0x80000000) + + CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + %4:gr64 = COPY $rax + %6:gr64_nosp = COPY %4 + + bb.5: + successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000) + + %5:gr64_nosp = COPY %6 + JMP64m $noreg, 8, %5, @computed_goto.dispatch, $noreg +... diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll index b5f3e78..f21c075 100644 --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -800,13 +800,13 @@ define void @shift_i32_by_32(ptr %src1, ptr %src2, ptr %dst) { ; CHECK-LABEL: shift_i32_by_32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl $-1, 4(%eax) -; CHECK-NEXT: movl $-1, (%eax) +; CHECK-NEXT: movl $0, 4(%eax) +; CHECK-NEXT: movl $0, (%eax) ; CHECK-NEXT: retl ; ; CHECK64-LABEL: shift_i32_by_32: ; CHECK64: # %bb.0: # %entry -; CHECK64-NEXT: movq $-1, (%rdx) +; CHECK64-NEXT: movq $0, (%rdx) ; CHECK64-NEXT: retq entry: %load1 = load i8, ptr %src1, align 1 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll index fc1cc1f..e10e48f 100644 --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -18,85 +18,80 @@ define i128 @foo(i128 %t, i128 %u) { ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: pushl %edi -; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 28 +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 -; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: imull %ecx, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: imull %edi, %eax ; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: 
movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl 48(%ebp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull 28(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: imull %edi, %esi ; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: mull 44(%ebp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: mull 44(%ebp) +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %eax, 8(%ecx) ; X86-NEXT: movl %edx, 12(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi -; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %ebx -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebp -; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl $4 %k = mul i128 %t, %u ret i128 %k diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll index 961205c..724b2dc 100644 --- a/llvm/test/CodeGen/X86/neg-abs.ll +++ b/llvm/test/CodeGen/X86/neg-abs.ll @@ -105,31 +105,35 @@ define i128 @neg_abs_i128(i128 %x) nounwind { ; X86-LABEL: neg_abs_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 
36(%ebp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: movl 32(%ebp), %edx ; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: subl %ebx, %ebp ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: subl %edi, %ebx ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -259,37 +263,42 @@ define i64 @sub_abs_i64(i64 %x, i64 %y) nounwind { define i128 @sub_abs_i128(i128 %x, i128 %y) nounwind { ; X86-LABEL: sub_abs_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: xorl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 32(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: xorl %edx, %edi ; X86-NEXT: subl %edx, %edi ; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 40(%ebp), %edx ; X86-NEXT: subl %edi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 44(%ebp), %edi ; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 48(%ebp), %esi ; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: sub_abs_i128: diff --git a/llvm/test/CodeGen/X86/noreturn-call-win64.ll b/llvm/test/CodeGen/X86/noreturn-call-win64.ll index 57aa022..13be1f13 100644 --- a/llvm/test/CodeGen/X86/noreturn-call-win64.ll +++ b/llvm/test/CodeGen/X86/noreturn-call-win64.ll @@ -111,3 +111,15 @@ declare dso_local void @"??1MakeCleanup@@QEAA@XZ"(ptr) ; CHECK: # %unreachable ; CHECK: int3 ; CHECK: .seh_handlerdata + + +define dso_local void @last_call_no_return() { + call void @abort1() + unreachable +} + +; CHECK-LABEL: last_call_no_return: +; CHECK: callq abort1 +; CHECK-NEXT: int3 +; CHECK-NEXT: .seh_endproc + diff --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll index 672ebc1e..69ae1f1 100644 --- a/llvm/test/CodeGen/X86/pcsections-atomics.ll +++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll @@ 
-9,6 +9,7 @@ ; RUN: llc -O1 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O1 ; RUN: llc -O2 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O2 ; RUN: llc -O3 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O3 +; RUN: llc -O3 -mcpu=haswell -mattr=cx16 < %s | FileCheck %s --check-prefixes=HASWELL-O3 target triple = "x86_64-unknown-linux-gnu" @@ -50,6 +51,14 @@ define void @mixed_atomic_non_atomic(ptr %a) { ; O3-NEXT: movl $1, (%rdi) ; O3-NEXT: decl (%rdi) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: mixed_atomic_non_atomic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: incl (%rdi) +; HASWELL-O3-NEXT: .Lpcsection0: +; HASWELL-O3-NEXT: movl $1, (%rdi) +; HASWELL-O3-NEXT: decl (%rdi) +; HASWELL-O3-NEXT: retq entry: ; Accesses the same location atomically and non-atomically. %0 = load volatile i32, ptr %a, align 4 @@ -107,6 +116,17 @@ define i64 @mixed_complex_atomic_non_atomic(ptr %a, ptr %b) { ; O3-NEXT: movq %rdx, (%rsi) ; O3-NEXT: addq %rcx, %rax ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: mixed_complex_atomic_non_atomic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movl $1, %eax +; HASWELL-O3-NEXT: .Lpcsection1: +; HASWELL-O3-NEXT: lock xaddq %rax, (%rdi) +; HASWELL-O3-NEXT: movq (%rsi), %rcx +; HASWELL-O3-NEXT: leaq 1(%rcx), %rdx +; HASWELL-O3-NEXT: movq %rdx, (%rsi) +; HASWELL-O3-NEXT: addq %rcx, %rax +; HASWELL-O3-NEXT: retq entry: %0 = atomicrmw add ptr %a, i64 1 monotonic, align 8, !pcsections !0 %1 = load i64, ptr %b, align 8 @@ -148,6 +168,14 @@ define i8 @atomic8_load_unordered(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection2: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a unordered, align 1, !pcsections !0 @@ -187,6 +215,14 @@ define i8 @atomic8_load_monotonic(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection3: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a monotonic, align 1, !pcsections !0 @@ -226,6 +262,14 @@ define i8 @atomic8_load_acquire(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection4: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a acquire, align 1, !pcsections !0 @@ -265,6 +309,14 @@ define i8 @atomic8_load_seq_cst(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection5: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a seq_cst, align 1, !pcsections !0 @@ -304,6 +356,14 @@ define void @atomic8_store_unordered(ptr 
%a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection6: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a unordered, align 1, !pcsections !0 @@ -343,6 +403,14 @@ define void @atomic8_store_monotonic(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection7: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a monotonic, align 1, !pcsections !0 @@ -382,6 +450,14 @@ define void @atomic8_store_release(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection8: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a release, align 1, !pcsections !0 @@ -425,6 +501,15 @@ define void @atomic8_store_seq_cst(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection9: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a seq_cst, align 1, !pcsections !0 @@ -468,6 +553,15 @@ define void @atomic8_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection10: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -507,6 +601,14 @@ define void @atomic8_add_monotonic(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection11: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -546,6 +648,14 @@ define void @atomic8_sub_monotonic(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection12: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 monotonic, align 
1, !pcsections !0 @@ -585,6 +695,14 @@ define void @atomic8_and_monotonic(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection13: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -624,6 +742,14 @@ define void @atomic8_or_monotonic(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection14: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -663,6 +789,14 @@ define void @atomic8_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection15: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -763,6 +897,27 @@ define void @atomic8_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection16: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB16_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection17: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection18: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection19: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection20: +; HASWELL-O3-NEXT: jne .LBB16_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -806,6 +961,15 @@ define void @atomic8_xchg_acquire(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection21: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -845,6 +1009,14 @@ define void @atomic8_add_acquire(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection22: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load 
volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -884,6 +1056,14 @@ define void @atomic8_sub_acquire(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection23: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -923,6 +1103,14 @@ define void @atomic8_and_acquire(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection24: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -962,6 +1150,14 @@ define void @atomic8_or_acquire(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection25: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1001,6 +1197,14 @@ define void @atomic8_xor_acquire(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection26: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1101,6 +1305,27 @@ define void @atomic8_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection27: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB23_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection28: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection29: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection30: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection31: +; HASWELL-O3-NEXT: jne .LBB23_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1144,6 +1369,15 @@ define void @atomic8_xchg_release(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection32: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; 
HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1183,6 +1417,14 @@ define void @atomic8_add_release(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection33: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1222,6 +1464,14 @@ define void @atomic8_sub_release(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection34: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1261,6 +1511,14 @@ define void @atomic8_and_release(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection35: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1300,6 +1558,14 @@ define void @atomic8_or_release(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection36: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1339,6 +1605,14 @@ define void @atomic8_xor_release(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection37: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1439,6 +1713,27 @@ define void @atomic8_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection38: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB30_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection39: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection40: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection41: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection42: +; HASWELL-O3-NEXT: jne .LBB30_1 +; 
HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1482,6 +1777,15 @@ define void @atomic8_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection43: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1521,6 +1825,14 @@ define void @atomic8_add_acq_rel(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection44: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1560,6 +1872,14 @@ define void @atomic8_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection45: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1599,6 +1919,14 @@ define void @atomic8_and_acq_rel(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection46: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1638,6 +1966,14 @@ define void @atomic8_or_acq_rel(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection47: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1677,6 +2013,14 @@ define void @atomic8_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection48: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1777,6 +2121,27 @@ define void @atomic8_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_acq_rel: +; HASWELL-O3: # %bb.0: # 
%entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection49: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB37_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection50: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection51: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection52: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection53: +; HASWELL-O3-NEXT: jne .LBB37_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1820,6 +2185,15 @@ define void @atomic8_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection54: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1859,6 +2233,14 @@ define void @atomic8_add_seq_cst(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection55: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1898,6 +2280,14 @@ define void @atomic8_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection56: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1937,6 +2327,14 @@ define void @atomic8_and_seq_cst(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection57: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1976,6 +2374,14 @@ define void @atomic8_or_seq_cst(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection58: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2015,6 +2421,14 @@ define void @atomic8_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic8_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection59: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2115,6 +2529,27 @@ define void @atomic8_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection60: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB44_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection61: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection62: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection63: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection64: +; HASWELL-O3-NEXT: jne .LBB44_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2200,6 +2635,25 @@ define void @atomic8_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection65: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection66: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection67: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection68: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection69: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection70: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 monotonic monotonic, align 1, !pcsections !0 @@ -2287,6 +2741,25 @@ define void @atomic8_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection71: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection72: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection73: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection74: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection75: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection76: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 acquire monotonic, align 1, !pcsections !0 @@ -2374,6 +2847,25 @@ define void @atomic8_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb 
$1, %cl +; HASWELL-O3-NEXT: .Lpcsection77: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection78: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection79: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection80: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection81: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection82: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 release monotonic, align 1, !pcsections !0 @@ -2461,6 +2953,25 @@ define void @atomic8_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection83: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection84: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection85: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection86: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection87: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection88: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 acq_rel monotonic, align 1, !pcsections !0 @@ -2548,6 +3059,25 @@ define void @atomic8_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection89: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection90: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection91: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection92: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection93: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection94: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 seq_cst monotonic, align 1, !pcsections !0 @@ -2589,6 +3119,14 @@ define i16 @atomic16_load_unordered(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection95: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a unordered, align 2, !pcsections !0 @@ -2628,6 +3166,14 @@ define i16 @atomic16_load_monotonic(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection96: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a monotonic, align 2, !pcsections !0 @@ -2667,6 
+3213,14 @@ define i16 @atomic16_load_acquire(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection97: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a acquire, align 2, !pcsections !0 @@ -2706,6 +3260,14 @@ define i16 @atomic16_load_seq_cst(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection98: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a seq_cst, align 2, !pcsections !0 @@ -2745,6 +3307,14 @@ define void @atomic16_store_unordered(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection99: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a unordered, align 2, !pcsections !0 @@ -2784,6 +3354,14 @@ define void @atomic16_store_monotonic(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection100: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a monotonic, align 2, !pcsections !0 @@ -2823,6 +3401,14 @@ define void @atomic16_store_release(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection101: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a release, align 2, !pcsections !0 @@ -2866,6 +3452,15 @@ define void @atomic16_store_seq_cst(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection102: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a seq_cst, align 2, !pcsections !0 @@ -2909,6 +3504,15 @@ define void @atomic16_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection103: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr 
@foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -2948,6 +3552,14 @@ define void @atomic16_add_monotonic(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection104: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -2987,6 +3599,14 @@ define void @atomic16_sub_monotonic(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection105: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3026,6 +3646,14 @@ define void @atomic16_and_monotonic(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection106: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3065,6 +3693,14 @@ define void @atomic16_or_monotonic(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection107: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3104,6 +3740,14 @@ define void @atomic16_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection108: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3220,6 +3864,31 @@ define void @atomic16_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection109: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB64_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection110: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection111: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection112: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection113: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, 
(%rdi)
+; HASWELL-O3-NEXT: .Lpcsection114:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax
+; HASWELL-O3-NEXT: .Lpcsection115:
+; HASWELL-O3-NEXT: jne .LBB64_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i16 42 monotonic, align 2, !pcsections !0
@@ -3263,6 +3932,15 @@ define void @atomic16_xchg_acquire(ptr %a) {
; O3-NEXT: xchgw %ax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xchg_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection116:
+; HASWELL-O3-NEXT: xchgw %ax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3302,6 +3980,14 @@ define void @atomic16_add_acquire(ptr %a) {
; O3-NEXT: lock addw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_add_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection117:
+; HASWELL-O3-NEXT: lock addw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3341,6 +4027,14 @@ define void @atomic16_sub_acquire(ptr %a) {
; O3-NEXT: lock subw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_sub_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection118:
+; HASWELL-O3-NEXT: lock subw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3380,6 +4074,14 @@ define void @atomic16_and_acquire(ptr %a) {
; O3-NEXT: lock andw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_and_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection119:
+; HASWELL-O3-NEXT: lock andw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3419,6 +4121,14 @@ define void @atomic16_or_acquire(ptr %a) {
; O3-NEXT: lock orw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_or_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection120:
+; HASWELL-O3-NEXT: lock orw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3458,6 +4168,14 @@ define void @atomic16_xor_acquire(ptr %a) {
; O3-NEXT: lock xorw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xor_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection121:
+; HASWELL-O3-NEXT: lock xorw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3574,6 +4292,31 @@ define void @atomic16_nand_acquire(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_nand_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection122:
+; HASWELL-O3-NEXT: movzwl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB71_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection123:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection124:
+; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5
+; HASWELL-O3-NEXT: .Lpcsection125:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax
+; HASWELL-O3-NEXT: .Lpcsection126:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection127:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax
+; HASWELL-O3-NEXT: .Lpcsection128:
+; HASWELL-O3-NEXT: jne .LBB71_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i16 42 acquire, align 2, !pcsections !0
@@ -3617,6 +4360,15 @@ define void @atomic16_xchg_release(ptr %a) {
; O3-NEXT: xchgw %ax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xchg_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection129:
+; HASWELL-O3-NEXT: xchgw %ax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3656,6 +4408,14 @@ define void @atomic16_add_release(ptr %a) {
; O3-NEXT: lock addw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_add_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection130:
+; HASWELL-O3-NEXT: lock addw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3695,6 +4455,14 @@ define void @atomic16_sub_release(ptr %a) {
; O3-NEXT: lock subw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_sub_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection131:
+; HASWELL-O3-NEXT: lock subw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3734,6 +4502,14 @@ define void @atomic16_and_release(ptr %a) {
; O3-NEXT: lock andw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_and_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection132:
+; HASWELL-O3-NEXT: lock andw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3773,6 +4549,14 @@ define void @atomic16_or_release(ptr %a) {
; O3-NEXT: lock orw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_or_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection133:
+; HASWELL-O3-NEXT: lock orw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3812,6 +4596,14 @@ define void @atomic16_xor_release(ptr %a) {
; O3-NEXT: lock xorw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xor_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection134:
+; HASWELL-O3-NEXT: lock xorw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3928,6 +4720,31 @@ define void @atomic16_nand_release(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_nand_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection135:
+; HASWELL-O3-NEXT: movzwl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB78_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection136:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection137:
+; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5
+; HASWELL-O3-NEXT: .Lpcsection138:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax
+; HASWELL-O3-NEXT: .Lpcsection139:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection140:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax
+; HASWELL-O3-NEXT: .Lpcsection141:
+; HASWELL-O3-NEXT: jne .LBB78_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i16 42 release, align 2, !pcsections !0
@@ -3971,6 +4788,15 @@ define void @atomic16_xchg_acq_rel(ptr %a) {
; O3-NEXT: xchgw %ax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xchg_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection142:
+; HASWELL-O3-NEXT: xchgw %ax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4010,6 +4836,14 @@ define void @atomic16_add_acq_rel(ptr %a) {
; O3-NEXT: lock addw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_add_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection143:
+; HASWELL-O3-NEXT: lock addw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4049,6 +4883,14 @@ define void @atomic16_sub_acq_rel(ptr %a) {
; O3-NEXT: lock subw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_sub_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection144:
+; HASWELL-O3-NEXT: lock subw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4088,6 +4930,14 @@ define void @atomic16_and_acq_rel(ptr %a) {
; O3-NEXT: lock andw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_and_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection145:
+; HASWELL-O3-NEXT: lock andw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4127,6 +4977,14 @@ define void @atomic16_or_acq_rel(ptr %a) {
; O3-NEXT: lock orw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_or_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection146:
+; HASWELL-O3-NEXT: lock orw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4166,6 +5024,14 @@ define void @atomic16_xor_acq_rel(ptr %a) {
; O3-NEXT: lock xorw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xor_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection147:
+; HASWELL-O3-NEXT: lock xorw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4282,6 +5148,31 @@ define void @atomic16_nand_acq_rel(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_nand_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection148:
+; HASWELL-O3-NEXT: movzwl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB85_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection149:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection150:
+; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5
+; HASWELL-O3-NEXT: .Lpcsection151:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax
+; HASWELL-O3-NEXT: .Lpcsection152:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection153:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax
+; HASWELL-O3-NEXT: .Lpcsection154:
+; HASWELL-O3-NEXT: jne .LBB85_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i16 42 acq_rel, align 2, !pcsections !0
@@ -4325,6 +5216,15 @@ define void @atomic16_xchg_seq_cst(ptr %a) {
; O3-NEXT: xchgw %ax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xchg_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection155:
+; HASWELL-O3-NEXT: xchgw %ax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4364,6 +5264,14 @@ define void @atomic16_add_seq_cst(ptr %a) {
; O3-NEXT: lock addw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_add_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection156:
+; HASWELL-O3-NEXT: lock addw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4403,6 +5311,14 @@ define void @atomic16_sub_seq_cst(ptr %a) {
; O3-NEXT: lock subw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_sub_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection157:
+; HASWELL-O3-NEXT: lock subw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4442,6 +5358,14 @@ define void @atomic16_and_seq_cst(ptr %a) {
; O3-NEXT: lock andw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_and_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection158:
+; HASWELL-O3-NEXT: lock andw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4481,6 +5405,14 @@ define void @atomic16_or_seq_cst(ptr %a) {
; O3-NEXT: lock orw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_or_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection159:
+; HASWELL-O3-NEXT: lock orw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4520,6 +5452,14 @@ define void @atomic16_xor_seq_cst(ptr %a) {
; O3-NEXT: lock xorw $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_xor_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection160:
+; HASWELL-O3-NEXT: lock xorw $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4636,6 +5576,31 @@ define void @atomic16_nand_seq_cst(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_nand_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection161:
+; HASWELL-O3-NEXT: movzwl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB92_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection162:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection163:
+; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5
+; HASWELL-O3-NEXT: .Lpcsection164:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax
+; HASWELL-O3-NEXT: .Lpcsection165:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection166:
+; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax
+; HASWELL-O3-NEXT: .Lpcsection167:
+; HASWELL-O3-NEXT: jne .LBB92_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i16 42 seq_cst, align 2, !pcsections !0
@@ -4712,6 +5677,22 @@ define void @atomic16_cas_monotonic(ptr %a) {
; O3-NEXT: lock cmpxchgw %cx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_cas_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $1, %cx
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection168:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection169:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection170:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i16 42, i16 1 monotonic monotonic, align 2, !pcsections !0
@@ -4790,6 +5771,22 @@ define void @atomic16_cas_acquire(ptr %a) {
; O3-NEXT: lock cmpxchgw %cx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_cas_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $1, %cx
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection171:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection172:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection173:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i16 42, i16 1 acquire monotonic, align 2, !pcsections !0
@@ -4868,6 +5865,22 @@ define void @atomic16_cas_release(ptr %a) {
; O3-NEXT: lock cmpxchgw %cx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_cas_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $1, %cx
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection174:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection175:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection176:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i16 42, i16 1 release monotonic, align 2, !pcsections !0
@@ -4946,6 +5959,22 @@ define void @atomic16_cas_acq_rel(ptr %a) {
; O3-NEXT: lock cmpxchgw %cx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_cas_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $1, %cx
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection177:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection178:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection179:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i16 42, i16 1 acq_rel monotonic, align 2, !pcsections !0
@@ -5024,6 +6053,22 @@ define void @atomic16_cas_seq_cst(ptr %a) {
; O3-NEXT: lock cmpxchgw %cx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic16_cas_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movw $1, %cx
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection180:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection181:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movw $42, %ax
+; HASWELL-O3-NEXT: .Lpcsection182:
+; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i16 42, i16 1 seq_cst monotonic, align 2, !pcsections !0
@@ -5065,6 +6110,14 @@ define i32 @atomic32_load_unordered(ptr %a) {
; O3-NEXT: movl (%rdi), %eax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_load_unordered:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection183:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i32, ptr %a unordered, align 4, !pcsections !0
@@ -5104,6 +6157,14 @@ define i32 @atomic32_load_monotonic(ptr %a) {
; O3-NEXT: movl (%rdi), %eax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_load_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection184:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i32, ptr %a monotonic, align 4, !pcsections !0
@@ -5143,6 +6204,14 @@ define i32 @atomic32_load_acquire(ptr %a) {
; O3-NEXT: movl (%rdi), %eax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_load_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection185:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i32, ptr %a acquire, align 4, !pcsections !0
@@ -5182,6 +6251,14 @@ define i32 @atomic32_load_seq_cst(ptr %a) {
; O3-NEXT: movl (%rdi), %eax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_load_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection186:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i32, ptr %a seq_cst, align 4, !pcsections !0
@@ -5221,6 +6298,14 @@ define void @atomic32_store_unordered(ptr %a) {
; O3-NEXT: movl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_store_unordered:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection187:
+; HASWELL-O3-NEXT: movl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i32 42, ptr %a unordered, align 4, !pcsections !0
@@ -5260,6 +6345,14 @@ define void @atomic32_store_monotonic(ptr %a) {
; O3-NEXT: movl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_store_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection188:
+; HASWELL-O3-NEXT: movl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i32 42, ptr %a monotonic, align 4, !pcsections !0
@@ -5299,6 +6392,14 @@ define void @atomic32_store_release(ptr %a) {
; O3-NEXT: movl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_store_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection189:
+; HASWELL-O3-NEXT: movl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i32 42, ptr %a release, align 4, !pcsections !0
@@ -5342,6 +6443,15 @@ define void @atomic32_store_seq_cst(ptr %a) {
; O3-NEXT: xchgl %eax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_store_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection190:
+; HASWELL-O3-NEXT: xchgl %eax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i32 42, ptr %a seq_cst, align 4, !pcsections !0
@@ -5385,6 +6495,15 @@ define void @atomic32_xchg_monotonic(ptr %a) {
; O3-NEXT: xchgl %eax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xchg_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection191:
+; HASWELL-O3-NEXT: xchgl %eax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5424,6 +6543,14 @@ define void @atomic32_add_monotonic(ptr %a) {
; O3-NEXT: lock addl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_add_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection192:
+; HASWELL-O3-NEXT: lock addl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5463,6 +6590,14 @@ define void @atomic32_sub_monotonic(ptr %a) {
; O3-NEXT: lock subl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_sub_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection193:
+; HASWELL-O3-NEXT: lock subl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5502,6 +6637,14 @@ define void @atomic32_and_monotonic(ptr %a) {
; O3-NEXT: lock andl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_and_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection194:
+; HASWELL-O3-NEXT: lock andl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5541,6 +6684,14 @@ define void @atomic32_or_monotonic(ptr %a) {
; O3-NEXT: lock orl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_or_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection195:
+; HASWELL-O3-NEXT: lock orl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5580,6 +6731,14 @@ define void @atomic32_xor_monotonic(ptr %a) {
; O3-NEXT: lock xorl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xor_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection196:
+; HASWELL-O3-NEXT: lock xorl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5680,6 +6839,27 @@ define void @atomic32_nand_monotonic(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_nand_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection197:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB112_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection198:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection199:
+; HASWELL-O3-NEXT: orl $-43, %ecx
+; HASWELL-O3-NEXT: .Lpcsection200:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection201:
+; HASWELL-O3-NEXT: jne .LBB112_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i32 42 monotonic, align 4, !pcsections !0
@@ -5723,6 +6903,15 @@ define void @atomic32_xchg_acquire(ptr %a) {
; O3-NEXT: xchgl %eax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xchg_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection202:
+; HASWELL-O3-NEXT: xchgl %eax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -5762,6 +6951,14 @@ define void @atomic32_add_acquire(ptr %a) {
; O3-NEXT: lock addl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_add_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection203:
+; HASWELL-O3-NEXT: lock addl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -5801,6 +6998,14 @@ define void @atomic32_sub_acquire(ptr %a) {
; O3-NEXT: lock subl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_sub_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection204:
+; HASWELL-O3-NEXT: lock subl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -5840,6 +7045,14 @@ define void @atomic32_and_acquire(ptr %a) {
; O3-NEXT: lock andl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_and_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection205:
+; HASWELL-O3-NEXT: lock andl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -5879,6 +7092,14 @@ define void @atomic32_or_acquire(ptr %a) {
; O3-NEXT: lock orl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_or_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection206:
+; HASWELL-O3-NEXT: lock orl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -5918,6 +7139,14 @@ define void @atomic32_xor_acquire(ptr %a) {
; O3-NEXT: lock xorl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xor_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection207:
+; HASWELL-O3-NEXT: lock xorl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -6018,6 +7247,27 @@ define void @atomic32_nand_acquire(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_nand_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection208:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB119_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection209:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection210:
+; HASWELL-O3-NEXT: orl $-43, %ecx
+; HASWELL-O3-NEXT: .Lpcsection211:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection212:
+; HASWELL-O3-NEXT: jne .LBB119_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i32 42 acquire, align 4, !pcsections !0
@@ -6061,6 +7311,15 @@ define void @atomic32_xchg_release(ptr %a) {
; O3-NEXT: xchgl %eax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xchg_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection213:
+; HASWELL-O3-NEXT: xchgl %eax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6100,6 +7359,14 @@ define void @atomic32_add_release(ptr %a) {
; O3-NEXT: lock addl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_add_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection214:
+; HASWELL-O3-NEXT: lock addl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6139,6 +7406,14 @@ define void @atomic32_sub_release(ptr %a) {
; O3-NEXT: lock subl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_sub_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection215:
+; HASWELL-O3-NEXT: lock subl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6178,6 +7453,14 @@ define void @atomic32_and_release(ptr %a) {
; O3-NEXT: lock andl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_and_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection216:
+; HASWELL-O3-NEXT: lock andl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6217,6 +7500,14 @@ define void @atomic32_or_release(ptr %a) {
; O3-NEXT: lock orl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_or_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection217:
+; HASWELL-O3-NEXT: lock orl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6256,6 +7547,14 @@ define void @atomic32_xor_release(ptr %a) {
; O3-NEXT: lock xorl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xor_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection218:
+; HASWELL-O3-NEXT: lock xorl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6356,6 +7655,27 @@ define void @atomic32_nand_release(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_nand_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection219:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB126_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection220:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection221:
+; HASWELL-O3-NEXT: orl $-43, %ecx
+; HASWELL-O3-NEXT: .Lpcsection222:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection223:
+; HASWELL-O3-NEXT: jne .LBB126_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i32 42 release, align 4, !pcsections !0
@@ -6399,6 +7719,15 @@ define void @atomic32_xchg_acq_rel(ptr %a) {
; O3-NEXT: xchgl %eax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xchg_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection224:
+; HASWELL-O3-NEXT: xchgl %eax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6438,6 +7767,14 @@ define void @atomic32_add_acq_rel(ptr %a) {
; O3-NEXT: lock addl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_add_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection225:
+; HASWELL-O3-NEXT: lock addl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6477,6 +7814,14 @@ define void @atomic32_sub_acq_rel(ptr %a) {
; O3-NEXT: lock subl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_sub_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection226:
+; HASWELL-O3-NEXT: lock subl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6516,6 +7861,14 @@ define void @atomic32_and_acq_rel(ptr %a) {
; O3-NEXT: lock andl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_and_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection227:
+; HASWELL-O3-NEXT: lock andl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6555,6 +7908,14 @@ define void @atomic32_or_acq_rel(ptr %a) {
; O3-NEXT: lock orl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_or_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection228:
+; HASWELL-O3-NEXT: lock orl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6594,6 +7955,14 @@ define void @atomic32_xor_acq_rel(ptr %a) {
; O3-NEXT: lock xorl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xor_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection229:
+; HASWELL-O3-NEXT: lock xorl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6694,6 +8063,27 @@ define void @atomic32_nand_acq_rel(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_nand_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection230:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB133_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection231:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection232:
+; HASWELL-O3-NEXT: orl $-43, %ecx
+; HASWELL-O3-NEXT: .Lpcsection233:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection234:
+; HASWELL-O3-NEXT: jne .LBB133_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i32 42 acq_rel, align 4, !pcsections !0
@@ -6737,6 +8127,15 @@ define void @atomic32_xchg_seq_cst(ptr %a) {
; O3-NEXT: xchgl %eax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xchg_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection235:
+; HASWELL-O3-NEXT: xchgl %eax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -6776,6 +8175,14 @@ define void @atomic32_add_seq_cst(ptr %a) {
; O3-NEXT: lock addl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_add_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection236:
+; HASWELL-O3-NEXT: lock addl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -6815,6 +8222,14 @@ define void @atomic32_sub_seq_cst(ptr %a) {
; O3-NEXT: lock subl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_sub_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection237:
+; HASWELL-O3-NEXT: lock subl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -6854,6 +8269,14 @@ define void @atomic32_and_seq_cst(ptr %a) {
; O3-NEXT: lock andl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_and_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection238:
+; HASWELL-O3-NEXT: lock andl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -6893,6 +8316,14 @@ define void @atomic32_or_seq_cst(ptr %a) {
; O3-NEXT: lock orl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_or_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection239:
+; HASWELL-O3-NEXT: lock orl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -6932,6 +8363,14 @@ define void @atomic32_xor_seq_cst(ptr %a) {
; O3-NEXT: lock xorl $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_xor_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection240:
+; HASWELL-O3-NEXT: lock xorl $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -7032,6 +8471,27 @@ define void @atomic32_nand_seq_cst(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_nand_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection241:
+; HASWELL-O3-NEXT: movl (%rdi), %eax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB140_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection242:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection243:
+; HASWELL-O3-NEXT: orl $-43, %ecx
+; HASWELL-O3-NEXT: .Lpcsection244:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection245:
+; HASWELL-O3-NEXT: jne .LBB140_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i32 42 seq_cst, align 4, !pcsections !0
@@ -7117,6 +8577,25 @@ define void @atomic32_cas_monotonic(ptr %a) {
; O3-NEXT: lock cmpxchgl %ecx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_cas_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $1, %ecx
+; HASWELL-O3-NEXT: .Lpcsection246:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection247:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection248:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection249:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection250:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection251:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i32 42, i32 1 monotonic monotonic, align 4, !pcsections !0
@@ -7204,6 +8683,25 @@ define void @atomic32_cas_acquire(ptr %a) {
; O3-NEXT: lock cmpxchgl %ecx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_cas_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $1, %ecx
+; HASWELL-O3-NEXT: .Lpcsection252:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection253:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection254:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection255:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection256:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection257:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i32 42, i32 1 acquire monotonic, align 4, !pcsections !0
@@ -7291,6 +8789,25 @@ define void @atomic32_cas_release(ptr %a) {
; O3-NEXT: lock cmpxchgl %ecx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_cas_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $1, %ecx
+; HASWELL-O3-NEXT: .Lpcsection258:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection259:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection260:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection261:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection262:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection263:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i32 42, i32 1 release monotonic, align 4, !pcsections !0
@@ -7378,6 +8895,25 @@ define void @atomic32_cas_acq_rel(ptr %a) {
; O3-NEXT: lock cmpxchgl %ecx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_cas_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $1, %ecx
+; HASWELL-O3-NEXT: .Lpcsection264:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection265:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection266:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection267:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection268:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection269:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i32 42, i32 1 acq_rel monotonic, align 4, !pcsections !0
@@ -7465,6 +9001,25 @@ define void @atomic32_cas_seq_cst(ptr %a) {
; O3-NEXT: lock cmpxchgl %ecx, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic32_cas_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $1, %ecx
+; HASWELL-O3-NEXT: .Lpcsection270:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection271:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection272:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection273:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection274:
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection275:
+; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = cmpxchg ptr %a, i32 42, i32 1 seq_cst monotonic, align 4, !pcsections !0
@@ -7506,6 +9061,14 @@ define i64 @atomic64_load_unordered(ptr %a) {
; O3-NEXT: movq (%rdi), %rax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_load_unordered:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection276:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i64, ptr %a unordered, align 8, !pcsections !0
@@ -7545,6 +9108,14 @@ define i64 @atomic64_load_monotonic(ptr %a) {
; O3-NEXT: movq (%rdi), %rax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_load_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection277:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i64, ptr %a monotonic, align 8, !pcsections !0
@@ -7584,6 +9155,14 @@ define i64 @atomic64_load_acquire(ptr %a) {
; O3-NEXT: movq (%rdi), %rax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_load_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection278:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i64, ptr %a acquire, align 8, !pcsections !0
@@ -7623,6 +9202,14 @@ define i64 @atomic64_load_seq_cst(ptr %a) {
; O3-NEXT: movq (%rdi), %rax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_load_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection279:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic i64, ptr %a seq_cst, align 8, !pcsections !0
@@ -7662,6 +9249,14 @@ define ptr @atomic64_load_seq_cst_ptr_ty(ptr %a) {
; O3-NEXT: movq (%rdi), %rax
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_load_seq_cst_ptr_ty:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection280:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = load atomic ptr, ptr %a seq_cst, align 8, !pcsections !0
@@ -7701,6 +9296,14 @@ define void @atomic64_store_unordered(ptr %a) {
; O3-NEXT: movq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_store_unordered:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection281:
+; HASWELL-O3-NEXT: movq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i64 42, ptr %a unordered, align 8, !pcsections !0
@@ -7740,6 +9343,14 @@ define void @atomic64_store_monotonic(ptr %a) {
; O3-NEXT: movq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_store_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection282:
+; HASWELL-O3-NEXT: movq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i64 42, ptr %a monotonic, align 8, !pcsections !0
@@ -7779,6 +9390,14 @@ define void @atomic64_store_release(ptr %a) {
; O3-NEXT: movq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_store_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection283:
+; HASWELL-O3-NEXT: movq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i64 42, ptr %a release, align 8, !pcsections !0
@@ -7822,6 +9441,15 @@ define void @atomic64_store_seq_cst(ptr %a) {
; O3-NEXT: xchgq %rax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_store_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection284:
+; HASWELL-O3-NEXT: xchgq %rax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic i64 42, ptr %a seq_cst, align 8, !pcsections !0
@@ -7861,6 +9489,14 @@ define void @atomic64_store_seq_cst_ptr_ty(ptr %a, ptr %v) {
; O3-NEXT: xchgq %rsi, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_store_seq_cst_ptr_ty:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection285:
+; HASWELL-O3-NEXT: xchgq %rsi, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
store atomic ptr %v, ptr %a seq_cst, align 8, !pcsections !0
@@ -7904,6 +9540,15 @@ define void @atomic64_xchg_monotonic(ptr %a) {
; O3-NEXT: xchgq %rax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xchg_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection286:
+; HASWELL-O3-NEXT: xchgq %rax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -7943,6 +9588,14 @@ define void @atomic64_add_monotonic(ptr %a) {
; O3-NEXT: lock addq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_add_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection287:
+; HASWELL-O3-NEXT: lock addq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -7982,6 +9635,14 @@ define void @atomic64_sub_monotonic(ptr %a) {
; O3-NEXT: lock subq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_sub_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection288:
+; HASWELL-O3-NEXT: lock subq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -8021,6 +9682,14 @@ define void @atomic64_and_monotonic(ptr %a) {
; O3-NEXT: lock andq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_and_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection289:
+; HASWELL-O3-NEXT: lock andq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -8060,6 +9729,14 @@ define void @atomic64_or_monotonic(ptr %a) {
; O3-NEXT: lock orq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_or_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection290:
+; HASWELL-O3-NEXT: lock orq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -8099,6 +9776,14 @@ define void @atomic64_xor_monotonic(ptr %a) {
; O3-NEXT: lock xorq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xor_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection291:
+; HASWELL-O3-NEXT: lock xorq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -8202,6 +9887,27 @@ define void @atomic64_nand_monotonic(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_nand_monotonic:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection292:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB162_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection293:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection294:
+; HASWELL-O3-NEXT: orq $-43, %rcx
+; HASWELL-O3-NEXT: .Lpcsection295:
+; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection296:
+; HASWELL-O3-NEXT: jne .LBB162_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i64 42 monotonic, align 8, !pcsections !0
@@ -8245,6 +9951,15 @@ define void @atomic64_xchg_acquire(ptr %a) {
; O3-NEXT: xchgq %rax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xchg_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection297:
+; HASWELL-O3-NEXT: xchgq %rax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8284,6 +9999,14 @@ define void @atomic64_add_acquire(ptr %a) {
; O3-NEXT: lock addq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_add_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection298:
+; HASWELL-O3-NEXT: lock addq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8323,6 +10046,14 @@ define void @atomic64_sub_acquire(ptr %a) {
; O3-NEXT: lock subq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_sub_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection299:
+; HASWELL-O3-NEXT: lock subq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8362,6 +10093,14 @@ define void @atomic64_and_acquire(ptr %a) {
; O3-NEXT: lock andq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_and_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection300:
+; HASWELL-O3-NEXT: lock andq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8401,6 +10140,14 @@ define void @atomic64_or_acquire(ptr %a) {
; O3-NEXT: lock orq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_or_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection301:
+; HASWELL-O3-NEXT: lock orq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8440,6 +10187,14 @@ define void @atomic64_xor_acquire(ptr %a) {
; O3-NEXT: lock xorq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xor_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection302:
+; HASWELL-O3-NEXT: lock xorq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8543,6 +10298,27 @@ define void @atomic64_nand_acquire(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_nand_acquire:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection303:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB169_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection304:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection305:
+; HASWELL-O3-NEXT: orq $-43, %rcx
+; HASWELL-O3-NEXT: .Lpcsection306:
+; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection307:
+; HASWELL-O3-NEXT: jne .LBB169_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i64 42 acquire, align 8, !pcsections !0
@@ -8586,6 +10362,15 @@ define void @atomic64_xchg_release(ptr %a) {
; O3-NEXT: xchgq %rax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xchg_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection308:
+; HASWELL-O3-NEXT: xchgq %rax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8625,6 +10410,14 @@ define void @atomic64_add_release(ptr %a) {
; O3-NEXT: lock addq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_add_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection309:
+; HASWELL-O3-NEXT: lock addq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8664,6 +10457,14 @@ define void @atomic64_sub_release(ptr %a) {
; O3-NEXT: lock subq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_sub_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection310:
+; HASWELL-O3-NEXT: lock subq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8703,6 +10504,14 @@ define void @atomic64_and_release(ptr %a) {
; O3-NEXT: lock andq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_and_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection311:
+; HASWELL-O3-NEXT: lock andq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8742,6 +10551,14 @@ define void @atomic64_or_release(ptr %a) {
; O3-NEXT: lock orq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_or_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection312:
+; HASWELL-O3-NEXT: lock orq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8781,6 +10598,14 @@ define void @atomic64_xor_release(ptr %a) {
; O3-NEXT: lock xorq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xor_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection313:
+; HASWELL-O3-NEXT: lock xorq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8884,6 +10709,27 @@ define void @atomic64_nand_release(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_nand_release:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection314:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB176_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection315:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection316:
+; HASWELL-O3-NEXT: orq $-43, %rcx
+; HASWELL-O3-NEXT: .Lpcsection317:
+; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection318:
+; HASWELL-O3-NEXT: jne .LBB176_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i64 42 release, align 8, !pcsections !0
@@ -8927,6 +10773,15 @@ define void @atomic64_xchg_acq_rel(ptr %a) {
; O3-NEXT: xchgq %rax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xchg_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection319:
+; HASWELL-O3-NEXT: xchgq %rax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -8966,6 +10821,14 @@ define void @atomic64_add_acq_rel(ptr %a) {
; O3-NEXT: lock addq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_add_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection320:
+; HASWELL-O3-NEXT: lock addq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw add ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -9005,6 +10868,14 @@ define void @atomic64_sub_acq_rel(ptr %a) {
; O3-NEXT: lock subq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_sub_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection321:
+; HASWELL-O3-NEXT: lock subq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw sub ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -9044,6 +10915,14 @@ define void @atomic64_and_acq_rel(ptr %a) {
; O3-NEXT: lock andq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_and_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection322:
+; HASWELL-O3-NEXT: lock andq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw and ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -9083,6 +10962,14 @@ define void @atomic64_or_acq_rel(ptr %a) {
; O3-NEXT: lock orq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_or_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection323:
+; HASWELL-O3-NEXT: lock orq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw or ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -9122,6 +11009,14 @@ define void @atomic64_xor_acq_rel(ptr %a) {
; O3-NEXT: lock xorq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xor_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection324:
+; HASWELL-O3-NEXT: lock xorq $42, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xor ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -9225,6 +11120,27 @@ define void @atomic64_nand_acq_rel(ptr %a) {
; O3-NEXT: # %bb.2: # %atomicrmw.end
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_nand_acq_rel:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: .Lpcsection325:
+; HASWELL-O3-NEXT: movq (%rdi), %rax
+; HASWELL-O3-NEXT: .p2align 4
+; HASWELL-O3-NEXT: .LBB183_1: # %atomicrmw.start
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; HASWELL-O3-NEXT: movl %eax, %ecx
+; HASWELL-O3-NEXT: .Lpcsection326:
+; HASWELL-O3-NEXT: notl %ecx
+; HASWELL-O3-NEXT: .Lpcsection327:
+; HASWELL-O3-NEXT: orq $-43, %rcx
+; HASWELL-O3-NEXT: .Lpcsection328:
+; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi)
+; HASWELL-O3-NEXT: .Lpcsection329:
+; HASWELL-O3-NEXT: jne .LBB183_1
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw nand ptr %a, i64 42 acq_rel, align 8, !pcsections !0
@@ -9268,6 +11184,15 @@ define void @atomic64_xchg_seq_cst(ptr %a) {
; O3-NEXT: xchgq %rax, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_xchg_seq_cst:
+; HASWELL-O3: # %bb.0: # %entry
+; HASWELL-O3-NEXT: movq foo(%rip), %rax
+; HASWELL-O3-NEXT: movl $42, %eax
+; HASWELL-O3-NEXT: .Lpcsection330:
+; HASWELL-O3-NEXT: xchgq %rax, (%rdi)
+; HASWELL-O3-NEXT: movq $1, foo(%rip)
+; HASWELL-O3-NEXT: retq
entry:
load volatile i64, ptr @foo, align 8
%x = atomicrmw xchg ptr %a, i64 42 seq_cst, align 8, !pcsections !0
@@ -9307,6 +11232,14 @@ define void @atomic64_add_seq_cst(ptr %a) {
; O3-NEXT: lock addq $42, (%rdi)
; O3-NEXT: movq $1, foo(%rip)
; O3-NEXT: retq
+;
+; HASWELL-O3-LABEL: atomic64_add_seq_cst:
+; HASWELL-O3: # %bb.0: #
%entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection331: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9346,6 +11279,14 @@ define void @atomic64_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection332: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9385,6 +11326,14 @@ define void @atomic64_and_seq_cst(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection333: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9424,6 +11373,14 @@ define void @atomic64_or_seq_cst(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection334: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9463,6 +11420,14 @@ define void @atomic64_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection335: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9566,6 +11531,27 @@ define void @atomic64_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection336: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB190_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection337: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection338: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection339: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection340: +; HASWELL-O3-NEXT: jne .LBB190_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9651,6 +11637,25 @@ define void @atomic64_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; 
O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection341: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection342: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection343: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection344: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection345: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection346: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 monotonic monotonic, align 8, !pcsections !0 @@ -9738,6 +11743,25 @@ define void @atomic64_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection347: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection348: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection349: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection350: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection351: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection352: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 acquire monotonic, align 8, !pcsections !0 @@ -9825,6 +11849,25 @@ define void @atomic64_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection353: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection354: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection355: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection356: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection357: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection358: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 release monotonic, align 8, !pcsections !0 @@ -9912,6 +11955,25 @@ define void @atomic64_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection359: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection360: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection361: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection362: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection363: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection364: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: 
retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 acq_rel monotonic, align 8, !pcsections !0 @@ -9999,6 +12061,25 @@ define void @atomic64_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $3, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection365: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection366: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection367: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection368: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection369: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection370: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $3, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 seq_cst monotonic, align 8, !pcsections !0 @@ -10044,6 +12125,15 @@ define void @atomic64_cas_seq_cst_ptr_ty(ptr %a, ptr %v1, ptr %v2) { ; O3-NEXT: lock cmpxchgq %rdx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq %rsi, %rax +; HASWELL-O3-NEXT: movq foo(%rip), %rcx +; HASWELL-O3-NEXT: .Lpcsection371: +; HASWELL-O3-NEXT: lock cmpxchgq %rdx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, ptr %v1, ptr %v2 seq_cst seq_cst, align 8, !pcsections !0 @@ -10102,6 +12192,18 @@ define i64 @atomic_use_cond(ptr %a) { ; O3-NEXT: .LBB197_2: # %else ; O3-NEXT: movl $2, %eax ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic_use_cond: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: .Lpcsection372: +; HASWELL-O3-NEXT: lock decq (%rdi) +; HASWELL-O3-NEXT: jne .LBB197_2 +; HASWELL-O3-NEXT: # %bb.1: # %then +; HASWELL-O3-NEXT: movl $1, %eax +; HASWELL-O3-NEXT: retq +; HASWELL-O3-NEXT: .LBB197_2: # %else +; HASWELL-O3-NEXT: movl $2, %eax +; HASWELL-O3-NEXT: retq entry: %x = atomicrmw sub ptr %a, i64 1 seq_cst, align 8, !pcsections !0 %y = icmp eq i64 %x, 1 @@ -10196,6 +12298,18 @@ define i128 @atomic128_load_unordered(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection373: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection374: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection375: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a unordered, align 16, !pcsections !0 @@ -10285,6 +12399,18 @@ define i128 @atomic128_load_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection376: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection377: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection378: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile 
i64, ptr @foo, align 8 %x = load atomic i128, ptr %a monotonic, align 16, !pcsections !0 @@ -10374,6 +12500,18 @@ define i128 @atomic128_load_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection379: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection380: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection381: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a acquire, align 16, !pcsections !0 @@ -10463,6 +12601,18 @@ define i128 @atomic128_load_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection382: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection383: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection384: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a seq_cst, align 16, !pcsections !0 @@ -10502,6 +12652,14 @@ define ptr @atomic128_load_seq_cst_ptr_ty(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection385: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic ptr, ptr %a seq_cst, align 16, !pcsections !0 @@ -10629,6 +12787,16 @@ define void @atomic128_store_unordered(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection386: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection387: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a unordered, align 16, !pcsections !0 @@ -10756,6 +12924,16 @@ define void @atomic128_store_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection388: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection389: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a monotonic, align 16, !pcsections !0 @@ -10883,6 +13061,16 @@ define void @atomic128_store_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection390: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: 
.Lpcsection391: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a release, align 16, !pcsections !0 @@ -11010,6 +13198,18 @@ define void @atomic128_store_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection392: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection393: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection394: +; HASWELL-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a seq_cst, align 16, !pcsections !0 @@ -11049,6 +13249,14 @@ define void @atomic128_store_seq_cst_ptr_ty(ptr %a, ptr %v) { ; O3-NEXT: xchgq %rsi, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection395: +; HASWELL-O3-NEXT: xchgq %rsi, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic ptr %v, ptr %a seq_cst, align 16, !pcsections !0 @@ -11176,6 +13384,33 @@ define void @atomic128_xchg_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection396: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection397: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection398: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB208_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection399: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection400: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection401: +; HASWELL-O3-NEXT: jne .LBB208_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11309,6 +13544,35 @@ define void @atomic128_add_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection402: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection403: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB209_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection404: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: 
.Lpcsection405: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection406: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection407: +; HASWELL-O3-NEXT: jne .LBB209_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11442,6 +13706,35 @@ define void @atomic128_sub_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection408: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection409: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB210_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection410: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection411: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection412: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection413: +; HASWELL-O3-NEXT: jne .LBB210_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11574,6 +13867,34 @@ define void @atomic128_and_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection414: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection415: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB211_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection416: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection417: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection418: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection419: +; HASWELL-O3-NEXT: jne .LBB211_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11699,6 +14020,33 @@ define void @atomic128_or_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection420: +; HASWELL-O3-NEXT: 
movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection421: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB212_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection422: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection423: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection424: +; HASWELL-O3-NEXT: jne .LBB212_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11824,6 +14172,33 @@ define void @atomic128_xor_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection425: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection426: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB213_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection427: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection428: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection429: +; HASWELL-O3-NEXT: jne .LBB213_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11964,6 +14339,36 @@ define void @atomic128_nand_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection430: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection431: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection432: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB214_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection433: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection434: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection435: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection436: +; HASWELL-O3-NEXT: jne .LBB214_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -12091,6 +14496,33 @@ define void @atomic128_xchg_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: 
.cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection437: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection438: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection439: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB215_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection440: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection441: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection442: +; HASWELL-O3-NEXT: jne .LBB215_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12224,6 +14656,35 @@ define void @atomic128_add_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection443: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection444: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB216_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection445: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection446: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection447: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection448: +; HASWELL-O3-NEXT: jne .LBB216_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12357,6 +14818,35 @@ define void @atomic128_sub_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection449: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection450: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB217_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection451: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection452: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection453: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection454: +; HASWELL-O3-NEXT: jne .LBB217_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; 
HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12489,6 +14979,34 @@ define void @atomic128_and_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection455: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection456: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB218_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection457: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection458: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection459: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection460: +; HASWELL-O3-NEXT: jne .LBB218_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12614,6 +15132,33 @@ define void @atomic128_or_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection461: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection462: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB219_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection463: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection464: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection465: +; HASWELL-O3-NEXT: jne .LBB219_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12739,6 +15284,33 @@ define void @atomic128_xor_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection466: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection467: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB220_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection468: +; HASWELL-O3-NEXT: xorq $42, %rbx +; 
HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection469: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection470: +; HASWELL-O3-NEXT: jne .LBB220_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12879,6 +15451,36 @@ define void @atomic128_nand_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection471: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection472: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection473: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB221_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection474: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection475: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection476: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection477: +; HASWELL-O3-NEXT: jne .LBB221_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -13006,6 +15608,33 @@ define void @atomic128_xchg_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_release: +; HASWELL-O3: # %bb.0: +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection478: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection479: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection480: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB222_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection481: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection482: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection483: +; HASWELL-O3-NEXT: jne .LBB222_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 release, align 16, !pcsections !0 store volatile i64 1, ptr @foo, align 8 @@ -13138,6 +15767,35 @@ define void @atomic128_add_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection484: +; HASWELL-O3-NEXT: movq (%rdi), 
%rax +; HASWELL-O3-NEXT: .Lpcsection485: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB223_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection486: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection487: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection488: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection489: +; HASWELL-O3-NEXT: jne .LBB223_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13271,6 +15929,35 @@ define void @atomic128_sub_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection490: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection491: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB224_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection492: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection493: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection494: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection495: +; HASWELL-O3-NEXT: jne .LBB224_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13403,6 +16090,34 @@ define void @atomic128_and_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection496: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection497: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB225_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection498: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection499: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection500: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection501: +; HASWELL-O3-NEXT: jne .LBB225_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13528,6 +16243,33 @@ define void @atomic128_or_release(ptr %a) { ; 
O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection502: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection503: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB226_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection504: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection505: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection506: +; HASWELL-O3-NEXT: jne .LBB226_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13653,6 +16395,33 @@ define void @atomic128_xor_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection507: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection508: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB227_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection509: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection510: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection511: +; HASWELL-O3-NEXT: jne .LBB227_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13793,6 +16562,36 @@ define void @atomic128_nand_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection512: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection513: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection514: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB228_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection515: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection516: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection517: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection518: +; HASWELL-O3-NEXT: jne .LBB228_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq 
$1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13920,6 +16719,33 @@ define void @atomic128_xchg_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection519: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection520: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection521: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB229_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection522: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection523: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection524: +; HASWELL-O3-NEXT: jne .LBB229_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14053,6 +16879,35 @@ define void @atomic128_add_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection525: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection526: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB230_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection527: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection528: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection529: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection530: +; HASWELL-O3-NEXT: jne .LBB230_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14186,6 +17041,35 @@ define void @atomic128_sub_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection531: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection532: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB231_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection533: +; HASWELL-O3-NEXT: 
addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection534: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection535: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection536: +; HASWELL-O3-NEXT: jne .LBB231_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14318,6 +17202,34 @@ define void @atomic128_and_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection537: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection538: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB232_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection539: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection540: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection541: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection542: +; HASWELL-O3-NEXT: jne .LBB232_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14443,6 +17355,33 @@ define void @atomic128_or_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection543: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection544: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB233_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection545: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection546: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection547: +; HASWELL-O3-NEXT: jne .LBB233_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14568,6 +17507,33 @@ define void @atomic128_xor_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection548: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; 
HASWELL-O3-NEXT: .Lpcsection549: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB234_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection550: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection551: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection552: +; HASWELL-O3-NEXT: jne .LBB234_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14708,6 +17674,36 @@ define void @atomic128_nand_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection553: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection554: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection555: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB235_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection556: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection557: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection558: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection559: +; HASWELL-O3-NEXT: jne .LBB235_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14835,6 +17831,33 @@ define void @atomic128_xchg_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection560: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection561: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection562: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB236_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection563: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection564: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection565: +; HASWELL-O3-NEXT: jne .LBB236_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -14968,6 +17991,35 @@ define void @atomic128_add_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic128_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection566: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection567: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB237_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection568: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection569: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection570: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection571: +; HASWELL-O3-NEXT: jne .LBB237_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15101,6 +18153,35 @@ define void @atomic128_sub_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection572: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection573: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB238_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection574: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection575: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection576: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection577: +; HASWELL-O3-NEXT: jne .LBB238_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15233,6 +18314,34 @@ define void @atomic128_and_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection578: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection579: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB239_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection580: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection581: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection582: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection583: +; HASWELL-O3-NEXT: jne .LBB239_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: 
movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15358,6 +18467,33 @@ define void @atomic128_or_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection584: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection585: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB240_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection586: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection587: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection588: +; HASWELL-O3-NEXT: jne .LBB240_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15483,6 +18619,33 @@ define void @atomic128_xor_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection589: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection590: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB241_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection591: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection592: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection593: +; HASWELL-O3-NEXT: jne .LBB241_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15623,6 +18786,36 @@ define void @atomic128_nand_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection594: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection595: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection596: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB242_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection597: +; HASWELL-O3-NEXT: 
notl %ebx +; HASWELL-O3-NEXT: .Lpcsection598: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection599: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection600: +; HASWELL-O3-NEXT: jne .LBB242_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15781,6 +18974,43 @@ define void @atomic128_cas_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection601: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection602: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection603: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection604: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection605: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection606: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection607: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection608: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection609: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection610: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection611: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection612: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection613: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 monotonic monotonic, align 16, !pcsections !0 @@ -15941,6 +19171,43 @@ define void @atomic128_cas_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection614: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection615: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection616: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection617: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection618: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection619: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection620: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection621: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection622: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection623: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection624: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection625: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection626: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: 
.cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 acquire monotonic, align 16, !pcsections !0 @@ -16101,6 +19368,43 @@ define void @atomic128_cas_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection627: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection628: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection629: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection630: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection631: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection632: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection633: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection634: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection635: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection636: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection637: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection638: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection639: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 release monotonic, align 16, !pcsections !0 @@ -16261,6 +19565,43 @@ define void @atomic128_cas_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection640: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection641: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection642: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection643: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection644: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection645: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection646: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection647: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection648: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection649: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection650: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection651: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection652: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 acq_rel monotonic, align 16, !pcsections !0 @@ -16421,6 +19762,43 @@ define void @atomic128_cas_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_seq_cst: +; HASWELL-O3: 
# %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection653: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection654: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection655: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection656: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection657: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection658: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection659: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection660: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection661: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection662: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection663: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection664: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection665: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $3, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 seq_cst monotonic, align 16, !pcsections !0 diff --git a/llvm/test/CodeGen/X86/peephole-copy.mir b/llvm/test/CodeGen/X86/peephole-copy.mir index e24abf84..f399398 100644 --- a/llvm/test/CodeGen/X86/peephole-copy.mir +++ b/llvm/test/CodeGen/X86/peephole-copy.mir @@ -22,14 +22,14 @@ body: | bb.0: ; CHECK-LABEL: name: c ; CHECK: [[MOV32ri:%[0-9]+]]:gr32_abcd = MOV32ri 512 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df ; CHECK-NEXT: [[MOV32ri1:%[0-9]+]]:gr32_abcd = MOV32ri 512 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df ; CHECK-NEXT: RET 0 %2 = MOV32ri 512 %0 = COPY %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df + INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df %1 = COPY %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df + INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df RET 0 ... 
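The peephole-copy.mir hunk above changes only the INLINEASM flag word (2359305 -> 3211273), which is why the printed comment flips from reguse:GR32 to reguse:GR32_ABCD. An inline-asm operand flag packs the operand kind in bits 0-2, the operand-register count in bits 3-15, and, when a concrete register class is required, a class field in the upper 16 bits. The sketch below is a hypothetical standalone decoder assuming that layout from llvm/include/llvm/IR/InlineAsm.h, not a call into LLVM's API; the /* reguse:... */ comments the MIR printer emits remain the authoritative decode.

    // Hypothetical decoder for INLINEASM operand flag words; the bit layout
    // is assumed from llvm/include/llvm/IR/InlineAsm.h and the raw value of
    // the upper field is printed as-is rather than mapped to a class name.
    #include <cstdint>
    #include <cstdio>

    static void decodeInlineAsmFlag(uint32_t Flag) {
      uint32_t Kind   = Flag & 7;             // 1 == reguse, 4 == clobber
      uint32_t NumOps = (Flag >> 3) & 0x1fff; // operand registers consumed
      uint32_t RCF    = Flag >> 16;           // register-class field (0 if absent)
      std::printf("kind=%u numops=%u rcfield=%u\n", Kind, NumOps, RCF);
    }

    int main() {
      decodeInlineAsmFlag(2359305); // kind=1 numops=1 rcfield=36 -> GR32
      decodeInlineAsmFlag(3211273); // kind=1 numops=1 rcfield=49 -> GR32_ABCD
      return 0;
    }

Both words keep kind 1 (reguse) with a single operand register; only the class field moves from 36 to 49, i.e. the updated check lines record the narrower GR32_ABCD requirement rather than plain GR32.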
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index 35c7c0e..3004b8b 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -340,84 +340,87 @@ define i64 @cnt64(i64 %x) nounwind readnone { define i128 @cnt128(i128 %x) nounwind readnone { ; X86-NOSSE-LABEL: cnt128: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: movl 24(%ebp), %eax +; X86-NOSSE-NEXT: movl 32(%ebp), %ecx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %edx, %edi -; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %ecx, %esi +; 
X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, %edi ; X86-NOSSE-NEXT: shrl $4, %edi ; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl 28(%ebp), %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl $4, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %esi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: movl $0, 12(%eax) ; X86-NOSSE-NEXT: movl $0, 8(%eax) ; X86-NOSSE-NEXT: movl $0, 4(%eax) +; X86-NOSSE-NEXT: leal -8(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-BASE-LABEL: cnt128: @@ -462,20 +465,26 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; ; X86-POPCNT-LABEL: cnt128: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; 
X86-POPCNT-NEXT: movl %esi, (%eax) ; X86-POPCNT-NEXT: movl $0, 12(%eax) ; X86-POPCNT-NEXT: movl $0, 8(%eax) ; X86-POPCNT-NEXT: movl $0, 4(%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128: @@ -522,7 +531,11 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; ; X86-SSE2-LABEL: cnt128: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ -564,11 +577,17 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSE2-NEXT: movl $0, 12(%eax) ; X86-SSE2-NEXT: movl $0, 8(%eax) ; X86-SSE2-NEXT: movl $0, 4(%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -600,6 +619,8 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSSE3-NEXT: movl $0, 12(%eax) ; X86-SSSE3-NEXT: movl $0, 8(%eax) ; X86-SSSE3-NEXT: movl $0, 4(%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt @@ -928,87 +949,92 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-LABEL: cnt128_optsize: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl 32(%ebp), %edx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: subl %eax, %esi ; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ecx, %ebp +; X86-NOSSE-NEXT: movl %esi, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %esi, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %esi, %edi +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: movl $1431655765, %eax # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %eax, %esi +; X86-NOSSE-NEXT: subl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; 
X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %edx, %ebx +; X86-NOSSE-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %edx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: andl %edx, %ebx +; X86-NOSSE-NEXT: imull $16843009, %ebx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: movl 28(%ebp), %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: subl %edi, %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi ; X86-NOSSE-NEXT: shrl $2, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: addl %ebp, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: addl %edi, %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl 24(%ebp), %eax ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %ebx ; X86-NOSSE-NEXT: subl %ebx, %eax ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx ; X86-NOSSE-NEXT: shrl $2, %eax ; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: movl %eax, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: andl %eax, %ecx +; X86-NOSSE-NEXT: imull $16843009, %edi, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax -; X86-NOSSE-NEXT: subl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; 
X86-NOSSE-NEXT: addl %ecx, %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: xorl %ecx, %ecx -; X86-NOSSE-NEXT: movl %ecx, 12(%eax) -; X86-NOSSE-NEXT: movl %ecx, 8(%eax) -; X86-NOSSE-NEXT: movl %ecx, 4(%eax) -; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: leal -12(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -1057,13 +1083,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; ; X86-POPCNT-LABEL: cnt128_optsize: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx @@ -1071,7 +1101,9 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) ; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_optsize: @@ -1118,7 +1150,11 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; ; X86-SSE2-LABEL: cnt128_optsize: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ -1161,11 +1197,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128_optsize: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -1198,6 +1240,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSSE3-NEXT: movl %ecx, 8(%eax) ; X86-SSSE3-NEXT: movl %ecx, 4(%eax) ; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 
%x) ret i128 %cnt @@ -1415,85 +1459,88 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-NOSSE-LABEL: cnt128_pgso: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: movl 24(%ebp), %eax +; X86-NOSSE-NEXT: movl 32(%ebp), %ecx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %edx, %edi -; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %ecx +; X86-NOSSE-NEXT: 
movl %ecx, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, %edi ; X86-NOSSE-NEXT: shrl $4, %edi ; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl 28(%ebp), %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl $4, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %esi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: leal -8(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-BASE-LABEL: cnt128_pgso: @@ -1538,13 +1585,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; ; X86-POPCNT-LABEL: cnt128_pgso: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; 
X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx @@ -1552,7 +1603,9 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) ; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_pgso: @@ -1599,7 +1652,11 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; ; X86-SSE2-LABEL: cnt128_pgso: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ -1642,11 +1699,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128_pgso: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -1679,6 +1742,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSSE3-NEXT: movl %ecx, 8(%eax) ; X86-SSSE3-NEXT: movl %ecx, 4(%eax) ; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt diff --git a/llvm/test/CodeGen/X86/pr149841.ll b/llvm/test/CodeGen/X86/pr149841.ll new file mode 100644 index 0000000..c17a617 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr149841.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.bar = type { [5 x ptr] } + +@global = external dso_local global %struct.bar + +define i1 @foo(ptr %arg, i1 %arg1) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: cmpq $global+1, %rdi +; CHECK-NEXT: setne %al +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: retq +bb: + #dbg_value(ptr @global, !3, !DIExpression(), !5) + %icmp = icmp ne ptr %arg, getelementptr inbounds nuw (i8, ptr @global, i64 1) + %select = select i1 %arg1, i1 %icmp, i1 false + ret i1 %select +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "x.c", directory: "/proc/self/cwd") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1) +!4 = distinct !DISubprogram(name: "x", scope: null, file: !1, spFlags: DISPFlagDefinition, unit: !0) +!5 = !DILocation(line: 0, scope: !4) + diff --git a/llvm/test/CodeGen/X86/pr33960.ll 
b/llvm/test/CodeGen/X86/pr33960.ll index 44fe777..6ee270e 100644 --- a/llvm/test/CodeGen/X86/pr33960.ll +++ b/llvm/test/CodeGen/X86/pr33960.ll @@ -7,12 +7,10 @@ define void @PR33960() { ; X86-LABEL: PR33960: ; X86: # %bb.0: # %entry -; X86-NEXT: movl $-1, b ; X86-NEXT: retl ; ; X64-LABEL: PR33960: ; X64: # %bb.0: # %entry -; X64-NEXT: movl $-1, b(%rip) ; X64-NEXT: retq entry: %tmp = insertelement <4 x i32> <i32 undef, i32 -7, i32 -3, i32 undef>, i32 -2, i32 3 diff --git a/llvm/test/CodeGen/X86/pr46004.ll b/llvm/test/CodeGen/X86/pr46004.ll index f7c7da0..829d6df 100644 --- a/llvm/test/CodeGen/X86/pr46004.ll +++ b/llvm/test/CodeGen/X86/pr46004.ll @@ -6,7 +6,17 @@ define void @fuzz22357(i128 %a0) { ; X86-LABEL: fuzz22357: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movb $0, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: fuzz22357: @@ -24,6 +34,15 @@ define void @fuzz22357(i128 %a0) { define void @fuzz22723(i128 %a0) { ; X86-LABEL: fuzz22723: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: fuzz22723: diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index 2d1b7fc..9728e13 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -42,10 +42,10 @@ define i64 @PR62286(i32 %a) { ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll new file mode 100644 index 0000000..841061c --- /dev/null +++ b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll @@ -0,0 +1,47 @@ +; REQUIRES: asserts +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-windows-msvc < %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: warning: Guid:8314849053352128226 Name:inlinee does not exist in pseudo probe desc +; CHECK: warning: Guid:6492337042787843907 Name:extract2 does not exist in pseudo probe desc + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +define void @extract1() !dbg !8 { +entry: + call void @llvm.pseudoprobe(i64 6028998432455395745, i64 1, i32 0, i64 -1), !dbg !11 + call void @llvm.pseudoprobe(i64 8314849053352128226, i64 1, i32 0, i64 -1), !dbg !12 + ret void, !dbg !16 +} + +define void @extract2() !dbg !17 { +entry: + 
call void @llvm.pseudoprobe(i64 6492337042787843907, i64 1, i32 0, i64 -1), !dbg !18 + ret void, !dbg !18 +} + +declare void @llvm.pseudoprobe(i64, i64, i32, i64) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6} +!llvm.pseudo_probe_desc = !{!7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home/foo") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{i64 6028998432455395745, i64 281479271677951, !"extract1"} +!8 = distinct !DISubprogram(name: "extract1", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !0) +!9 = !DISubroutineType(types: !10) +!10 = !{} +!11 = !DILocation(line: 5, column: 3, scope: !8) +!12 = !DILocation(line: 2, column: 1, scope: !13, inlinedAt: !14) +!13 = distinct !DISubprogram(name: "inlinee", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!14 = distinct !DILocation(line: 5, column: 3, scope: !15) +!15 = !DILexicalBlockFile(scope: !8, file: !1, discriminator: 455082007) +!16 = !DILocation(line: 6, column: 1, scope: !8) +!17 = distinct !DISubprogram(name: "extract2", scope: !1, file: !1, line: 8, type: !9, scopeLine: 8, spFlags: DISPFlagDefinition, unit: !0) +!18 = !DILocation(line: 9, column: 1, scope: !17) diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll index 50a967e..ce9723b 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -762,11 +762,15 @@ define i32 @x_to_s32(x86_fp80 %a) nounwind { define i32 @t_to_u32(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_u32: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfsi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_u32: @@ -797,12 +801,18 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u32: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixunstfsi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u32: @@ -835,12 +845,18 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_u32: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl 
{{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixunstfsi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_u32: @@ -860,11 +876,15 @@ define i32 @t_to_u32(fp128 %a) nounwind { define i32 @t_to_s32(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_s32: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfsi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_s32: @@ -895,12 +915,18 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s32: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixtfsi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s32: @@ -933,12 +959,18 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_s32: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixtfsi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_s32: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll index f516db8..3287869 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -1417,11 +1417,15 @@ define i64 @x_to_s64(x86_fp80 %a) nounwind { define i64 @t_to_u64(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_u64: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfdi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; 
X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_u64: @@ -1452,12 +1456,18 @@ define i64 @t_to_u64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixunstfdi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u64: @@ -1490,12 +1500,18 @@ define i64 @t_to_u64(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_u64: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixunstfdi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_u64: @@ -1515,11 +1531,15 @@ define i64 @t_to_u64(fp128 %a) nounwind { define i64 @t_to_s64(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_s64: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfdi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_s64: @@ -1550,12 +1570,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixtfdi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s64: @@ -1588,12 +1614,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_s64: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 
16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixtfdi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_s64: diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 87491362..8a28722 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -118,30 +118,33 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; X86-LABEL: scmp.8.128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: cmpl %ecx, 8(%ebp) +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %ebx +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: setl %cl -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: cmpl 8(%ebp), %esi +; X86-NEXT: sbbl 12(%ebp), %eax +; X86-NEXT: sbbl 16(%ebp), %edi +; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: setl %al ; X86-NEXT: subb %cl, %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 4925f8b..392bc83 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -307,69 +307,70 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp +; X86-NEXT: subl $112, %esp ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl $31, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: shldl $31, %edi, %esi +; X86-NEXT: shldl $31, %ecx, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ebx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: sets %al -; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %esi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %al +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %bl +; X86-NEXT: xorb %al, %bl ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al -; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: testb %bl, %al +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %edx ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e7727a0..7df490f 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -370,67 +370,68 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: 
pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 20(%ebp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %eax, %edi -; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: subl $128, %esp +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: shldl $31, %edi, %ebx +; X86-NEXT: shldl $31, %esi, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %esi -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %edi ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: sets %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sbbl $0, %ebx ; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %dl -; X86-NEXT: xorb %al, %dl -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: leal 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax +; X86-NEXT: sets %al +; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -438,41 +439,38 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: cmpl $-1, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %edi ; X86-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF -; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edi -; X86-NEXT: movl %edi, %eax ; X86-NEXT: cmovgel %ecx, %ebx +; X86-NEXT: cmovgel %ecx, %eax ; X86-NEXT: movl $-1, %ecx -; X86-NEXT: cmovgel %ecx, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl $-2147483648, %edi # imm = 0x80000000 -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl $-1, %edi -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: cmovgel %ecx, %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: negl %esi +; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %esi +; X86-NEXT: cmovgel %eax, %edi ; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-NEXT: cmovgel %eax, %edx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -805,137 +803,155 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $208, %esp -; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: subl $240, %esp +; X86-NEXT: movl 12(%ebp), %esi +; X86-NEXT: movl 20(%ebp), %edi ; X86-NEXT: movl 16(%ebp), %ebx -; X86-NEXT: movl 32(%ebp), 
%eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: leal (%ebx,%ebx), %eax ; X86-NEXT: shrl $31, %ebx ; X86-NEXT: shldl $31, %eax, %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 36(%ebp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: leal (%edi,%edi), %eax +; X86-NEXT: shrl $31, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 28(%ebp), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: sarl $31, %edi -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: leal (%esi,%esi), %eax +; X86-NEXT: shrl $31, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: leal (%ecx,%ecx), %eax ; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl 28(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -949,18 +965,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %bh -; X86-NEXT: xorb %bl, %bh +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl %edi, %esi -; X86-NEXT: setne %bl -; X86-NEXT: testb %bh, %bl +; X86-NEXT: setne %bh +; X86-NEXT: testb %bl, %bh ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -1107,36 +1123,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: 
movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets %ah -; X86-NEXT: xorb %al, %ah -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -1144,38 +1148,38 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %esi -; X86-NEXT: cmovgel %eax, %ecx ; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %eax, %ecx +; X86-NEXT: cmovgel %eax, %esi ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovgel %edx, %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: sbbl %edi, %eax ; X86-NEXT: movl $0, %eax ; X86-NEXT: cmovgel %eax, %ebx -; X86-NEXT: cmovgel %edx, %edi -; X86-NEXT: shldl $31, %ebx, %edi +; X86-NEXT: cmovgel %edx, %esi +; X86-NEXT: shldl $31, %ebx, %esi ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/seh-catch-all.ll b/llvm/test/CodeGen/X86/seh-catch-all.ll index 5250bb9..4e25aab 100644 --- a/llvm/test/CodeGen/X86/seh-catch-all.ll +++ b/llvm/test/CodeGen/X86/seh-catch-all.ll @@ -40,7 +40,7 @@ catchall: ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long 
.Ltmp{{[0-9]+}}@IMGREL -; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL+1 +; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long .LBB0_2@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll index d958580..cb85f39 100644 --- a/llvm/test/CodeGen/X86/seh-catchpad.ll +++ b/llvm/test/CodeGen/X86/seh-catchpad.ll @@ -123,23 +123,23 @@ __except.ret: ; preds = %catch.dispatch.7 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long .LBB1_[[except1bb]]@IMGREL ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL ; CHECK-NEXT: .long .LBB1_[[except2bb]]@IMGREL ; CHECK-NEXT: .long .Ltmp2@IMGREL -; CHECK-NEXT: .long .Ltmp3@IMGREL+1 +; CHECK-NEXT: .long .Ltmp3@IMGREL ; CHECK-NEXT: .long "?dtor$[[finbb:[0-9]+]]@?0?main@4HA"@IMGREL ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long .Ltmp2@IMGREL -; CHECK-NEXT: .long .Ltmp3@IMGREL+1 +; CHECK-NEXT: .long .Ltmp3@IMGREL ; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL ; CHECK-NEXT: .long .LBB1_3@IMGREL ; CHECK-NEXT: .long .Ltmp6@IMGREL -; CHECK-NEXT: .long .Ltmp7@IMGREL+1 +; CHECK-NEXT: .long .Ltmp7@IMGREL ; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL ; CHECK-NEXT: .long .LBB1_3@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll index 7f70655..539d776 100644 --- a/llvm/test/CodeGen/X86/seh-except-finally.ll +++ b/llvm/test/CodeGen/X86/seh-except-finally.ll @@ -83,15 +83,15 @@ __try.cont: ; preds = %__except, %invoke.c ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long "?dtor$2@?0?use_both@4HA"@IMGREL ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL ; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL ; CHECK-NEXT: .long .Ltmp4@IMGREL -; CHECK-NEXT: .long .Ltmp5@IMGREL+1 +; CHECK-NEXT: .long .Ltmp5@IMGREL ; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL ; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll index 41823df..6093e5e 100644 --- a/llvm/test/CodeGen/X86/seh-finally.ll +++ b/llvm/test/CodeGen/X86/seh-finally.ll @@ -30,7 +30,7 @@ lpad: ; preds = %entry ; X64-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites ; X64-NEXT: .Llsda_begin0: ; X64-NEXT: .long .Ltmp0@IMGREL # LabelStart -; X64-NEXT: .long .Ltmp1@IMGREL+1 # LabelEnd +; X64-NEXT: .long .Ltmp1@IMGREL # LabelEnd ; X64-NEXT: .long "?dtor$2@?0?main@4HA"@IMGREL # FinallyFunclet ; X64-NEXT: .long 0 # Null ; X64-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-safe-div.ll b/llvm/test/CodeGen/X86/seh-safe-div.ll index 542d9f6..20169f8 100644 --- a/llvm/test/CodeGen/X86/seh-safe-div.ll +++ b/llvm/test/CodeGen/X86/seh-safe-div.ll @@ -60,6 +60,7 @@ __try.cont: ; CHECK: .Ltmp0: ; CHECK: leaq [[rloc:.*\(%rbp\)]], %rcx ; CHECK: callq try_body +; CHECK: nop ; CHECK-NEXT: .Ltmp1 ; CHECK: [[cont_bb:\.LBB0_[0-9]+]]: ; CHECK: movl [[rloc]], %eax @@ -82,11 +83,11 @@ __try.cont: ; 
CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long safe_div_filt0@IMGREL ; CHECK-NEXT: .long [[handler0]]@IMGREL ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long safe_div_filt1@IMGREL ; CHECK-NEXT: .long [[handler1]]@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll index 2c576df..5a6aeb6 100644 --- a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll +++ b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll @@ -56,8 +56,8 @@ declare dso_local void @printf(ptr, ...) ; CHECK-NEXT:$ip2state$test: ; CHECK-NEXT: .long .Lfunc_begin0@IMGREL # IP ; CHECK-NEXT: .long -1 # ToState -; CHECK-NEXT: .long .Ltmp0@IMGREL+1 # IP +; CHECK-NEXT: .long .Ltmp0@IMGREL # IP ; CHECK-NEXT: .long 0 # ToState -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 # IP +; CHECK-NEXT: .long .Ltmp1@IMGREL # IP ; CHECK-NEXT: .long -1 # ToState diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll index d273d09..c7cf9cb 100644 --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -229,9 +229,10 @@ define i32 @expensive_val_operand4(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { } ; Expensive cold value operand with unsafe-to-sink (due to lifetime-end marker) load (partial slice sinking). -define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { +define i32 @expensive_val_operand5(i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand5( -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]]) ; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] ; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] @@ -242,6 +243,7 @@ define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; + %a = alloca i32 %load = load i32, ptr %a, align 8 call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %a) %x = add i32 %load, %b diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index d2b292f..2ac2be5 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body ; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5 -; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5 +; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi ; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8 ; CHECK-AVX2-NEXT: vmovq %xmm5, %r9 diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index 76cb4e87..dfeef48 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -792,14 
+792,24 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32 define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) { ; X86-LABEL: combineShiftOfShiftedLogic: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl $0, 16(%ecx) ; X86-NEXT: movl $0, 12(%ecx) ; X86-NEXT: movl $0, 8(%ecx) ; X86-NEXT: movl $0, 4(%ecx) ; X86-NEXT: movl $0, (%ecx) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: combineShiftOfShiftedLogic: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 767bd77..9323cd5 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -212,9 +212,18 @@ entry: } define void @test_lshr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind { -; ALL-LABEL: test_lshr_i128_outofrange: -; ALL: # %bb.0: # %entry -; ALL-NEXT: ret{{[l|q]}} +; i686-LABEL: test_lshr_i128_outofrange: +; i686: # %bb.0: # %entry +; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: andl $-16, %esp +; i686-NEXT: movl %ebp, %esp +; i686-NEXT: popl %ebp +; i686-NEXT: retl +; +; x86_64-LABEL: test_lshr_i128_outofrange: +; x86_64: # %bb.0: # %entry +; x86_64-NEXT: retq entry: %0 = lshr i128 %x, -1 store i128 %0, ptr %r, align 16 @@ -222,9 +231,18 @@ entry: } define void @test_ashr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind { -; ALL-LABEL: test_ashr_i128_outofrange: -; ALL: # %bb.0: # %entry -; ALL-NEXT: ret{{[l|q]}} +; i686-LABEL: test_ashr_i128_outofrange: +; i686: # %bb.0: # %entry +; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: andl $-16, %esp +; i686-NEXT: movl %ebp, %esp +; i686-NEXT: popl %ebp +; i686-NEXT: retl +; +; x86_64-LABEL: test_ashr_i128_outofrange: +; x86_64: # %bb.0: # %entry +; x86_64-NEXT: retq entry: %0 = ashr i128 %x, -1 store i128 %0, ptr %r, align 16 @@ -232,9 +250,18 @@ entry: } define void @test_shl_i128_outofrange(i128 %x, ptr nocapture %r) nounwind { -; ALL-LABEL: test_shl_i128_outofrange: -; ALL: # %bb.0: # %entry -; ALL-NEXT: ret{{[l|q]}} +; i686-LABEL: test_shl_i128_outofrange: +; i686: # %bb.0: # %entry +; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: andl $-16, %esp +; i686-NEXT: movl %ebp, %esp +; i686-NEXT: popl %ebp +; i686-NEXT: retl +; +; x86_64-LABEL: test_shl_i128_outofrange: +; x86_64: # %bb.0: # %entry +; x86_64-NEXT: retq entry: %0 = shl i128 %x, -1 store i128 %0, ptr %r, align 16 @@ -874,26 +901,31 @@ define <2 x i256> @shl_zext_lshr_outofrange(<2 x i128> %a0) { define i128 @lshr_shl_mask(i128 %a0) { ; i686-LABEL: lshr_shl_mask: ; i686: # %bb.0: -; i686-NEXT: pushl %edi +; i686-NEXT: pushl %ebp ; i686-NEXT: .cfi_def_cfa_offset 8 +; i686-NEXT: .cfi_offset %ebp, -8 +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: .cfi_def_cfa_register %ebp +; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: .cfi_def_cfa_offset 12 -; i686-NEXT: .cfi_offset %esi, -12 -; i686-NEXT: .cfi_offset %edi, -8 -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi +; 
i686-NEXT: andl $-16, %esp +; i686-NEXT: .cfi_offset %esi, -16 +; i686-NEXT: .cfi_offset %edi, -12 +; i686-NEXT: movl 8(%ebp), %eax +; i686-NEXT: movl 24(%ebp), %ecx +; i686-NEXT: movl 28(%ebp), %edx +; i686-NEXT: movl 32(%ebp), %esi ; i686-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF -; i686-NEXT: andl {{[0-9]+}}(%esp), %edi +; i686-NEXT: andl 36(%ebp), %edi ; i686-NEXT: movl %edi, 12(%eax) ; i686-NEXT: movl %esi, 8(%eax) ; i686-NEXT: movl %edx, 4(%eax) ; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: leal -8(%ebp), %esp ; i686-NEXT: popl %esi -; i686-NEXT: .cfi_def_cfa_offset 8 ; i686-NEXT: popl %edi -; i686-NEXT: .cfi_def_cfa_offset 4 +; i686-NEXT: popl %ebp +; i686-NEXT: .cfi_def_cfa %esp, 4 ; i686-NEXT: retl $4 ; ; x86_64-LABEL: lshr_shl_mask: diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll index 86891e9..509d444 100644 --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -151,31 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmpl 24(%ebp), %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl 28(%ebp), %esi +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: cmovll 24(%ebp), %ebx +; X86-NEXT: cmovll 28(%ebp), %edi +; X86-NEXT: cmovll 32(%ebp), %edx +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -717,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; ; X86-LABEL: test_signbits_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx -; X86-NEXT: sarl $28, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: shrdl $28, %edx, %ecx +; X86-NEXT: sarl $28, %edx ; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl 
%edx, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmovll %edx, %edi -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %ax = ashr i128 %a, 64 %bx = ashr i128 %b, 92 diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index 8907f6c..5e9fe27 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -151,32 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: cmpl %ecx, 24(%ebp) +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovll %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: cmovll 24(%ebp), %ecx +; X86-NEXT: cmovll 28(%ebp), %edx +; X86-NEXT: cmovll 32(%ebp), %esi +; X86-NEXT: cmovll %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -718,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; ; X86-LABEL: test_signbits_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx -; X86-NEXT: sarl $28, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: shrdl $28, %edx, %ecx +; X86-NEXT: sarl $28, %edx ; X86-NEXT: cmpl %ecx, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl %eax, 
%edi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmovll %edx, %edi -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %ax = ashr i128 %a, 64 %bx = ashr i128 %b, 92 diff --git a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll index e2de2ff..74fe07e 100644 --- a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll +++ b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll @@ -84,12 +84,12 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 { ; X86_64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X86_64-NEXT: .Ltmp0: ; X86_64-NEXT: callq throw +; X86_64-NEXT: nop ; X86_64-NEXT: .Ltmp1: ; X86_64-NEXT: # %bb.1: # %bb14 ; X86_64-NEXT: .LBB0_3: # Block address taken ; X86_64-NEXT: # %exit ; X86_64-NEXT: $ehgcr_0_3: -; X86_64-NEXT: nop ; X86_64-NEXT: .seh_startepilogue ; X86_64-NEXT: addq $64, %rsp ; X86_64-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/stack-protector.ll b/llvm/test/CodeGen/X86/stack-protector.ll index f4f3ae4..772e776 100644 --- a/llvm/test/CodeGen/X86/stack-protector.ll +++ b/llvm/test/CodeGen/X86/stack-protector.ll @@ -6,6 +6,7 @@ ; RUN: llc -mtriple=amd64-pc-openbsd < %s -o - | FileCheck --check-prefix=OPENBSD-AMD64 %s ; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s ; RUN: llc -mtriple=x86_64-w64-mingw32 < %s -o - | FileCheck --check-prefix=MINGW-X64 %s +; RUN: llc -mtriple=x86_64-pc-cygwin < %s -o - | FileCheck --check-prefix=MINGW-X64 %s ; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=IGNORE_INTRIN %s %struct.foo = type { [16 x i8] } diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll index e556900..3330403 100644 --- a/llvm/test/CodeGen/X86/swap.ll +++ b/llvm/test/CodeGen/X86/swap.ll @@ -47,12 +47,10 @@ define dso_local void @onealloc_noreadback(ptr nocapture %a, ptr nocapture %b) l entry: %alloc = alloca [16 x i8], i8 2, align 1 %part2 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0 - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %a, i64 16, i1 false) tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part2, ptr align 1 %b, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc) ret void } @@ -116,19 +114,16 @@ define dso_local void @onealloc_readback_1(ptr nocapture %a, ptr nocapture %b) l ; AA-LABEL: onealloc_readback_1: ; AA: # %bb.0: # %entry ; AA-NEXT: vmovups (%rsi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AA-NEXT: vmovups %xmm0, (%rdi) ; AA-NEXT: retq entry: %alloc = alloca [16 x i8], i8 2, align 1 %part1 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0 - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part1) - call void 
@llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc) + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part1, ptr align 1 %a, i64 16, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %b, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part1) tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %a, ptr align 1 %alloc, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc) ret void } @@ -145,19 +140,16 @@ define dso_local void @onealloc_readback_2(ptr nocapture %a, ptr nocapture %b) l ; AA-LABEL: onealloc_readback_2: ; AA: # %bb.0: # %entry ; AA-NEXT: vmovups (%rsi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AA-NEXT: vmovups %xmm0, (%rdi) ; AA-NEXT: retq entry: %alloc = alloca [16 x i8], i8 2, align 1 %part2 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0 - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %a, i64 16, i1 false) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part2, ptr align 1 %b, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc) tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %a, ptr align 1 %part2, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc) ret void } diff --git a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll index 967e125..f3bef47 100644 --- a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll +++ b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll @@ -37,9 +37,11 @@ cond.end: ; preds = %entry, %cond.true ; CHECK: testq ; CHECK: je ; CHECK: callq alloc +; CHECK-NEXT: nop ; CHECK-NEXT: [[L1:.Ltmp[0-9]+]] ; CHECK: jmp f2 # TAILCALL ; CHECK: callq alloc +; CHECK-NEXT: nop ; CHECK-NEXT: [[L3:.Ltmp[0-9]+]] ; CHECK: jmp f2 # TAILCALL diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 6a52acf..7f17299 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -107,29 +107,33 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; X86-LABEL: ucmp.8.128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: cmpl %eax, 24(%ebp) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 16(%ebp), %ebx +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: setb %al -; X86-NEXT: cmpl %ebx, 
{{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl 8(%ebp), %edi +; X86-NEXT: cmpl 24(%ebp), %edi +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %ebx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: sbbb $0, %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll index 5b1e054..82dfeee 100644 --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -153,26 +153,28 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: shrl %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %esp, %esi -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll index 30a7f80..3da5973 100644 --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -194,32 +194,34 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: shrl %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %esp, %esi -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: movl $-1, %edx ; X86-NEXT: jne .LBB4_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: .LBB4_2: ; X86-NEXT: leal -4(%ebp), %esp diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index f589d4a..7ef8599 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -232,31 +232,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmpl 24(%ebp), %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl 28(%ebp), %esi +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: cmovbl 24(%ebp), %ebx +; X86-NEXT: cmovbl 28(%ebp), %edi +; X86-NEXT: cmovbl 32(%ebp), %edx +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -282,37 +285,40 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-LABEL: test_i128_1: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax ; X86-NEXT: cmpl $1, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %edx, %edx -; X86-NEXT: movl $1, %edi -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: cmovel %ebx, %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %edx, %ebx -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: cmpl $0, 28(%ebp) +; X86-NEXT: movl $1, %esi +; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx 
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: movl $1, %ebx
+; X86-NEXT: cmovbl %eax, %ebx
+; X86-NEXT: cmovbl 28(%ebp), %edi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: cmovel %esi, %ebx
+; X86-NEXT: cmovel 28(%ebp), %edi
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -1312,29 +1318,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: shrdl $28, %edi, %ecx
-; X86-NEXT: sarl $28, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: shrdl $28, %edx, %ecx
+; X86-NEXT: sarl $28, %edx
 ; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl %eax, %edi
 ; X86-NEXT: cmovbl %esi, %ecx
-; X86-NEXT: cmovbl %edx, %edi
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: cmovbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 %ax = ashr i128 %a, 64
 %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index 7a5cdbb..c927abf 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -147,32 +147,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmpl %edx, %ebx
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: cmpl %ecx, 24(%ebp)
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %edi
 ; X86-NEXT: movl %edi, %eax
-; X86-NEXT: sbbl %ebp, %eax
-; X86-NEXT: cmovbl %ebx, %edx
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovbl %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: cmovbl 24(%ebp), %ecx
+; X86-NEXT: cmovbl 28(%ebp), %edx
+; X86-NEXT: cmovbl 32(%ebp), %esi
+; X86-NEXT: cmovbl %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -727,29 +729,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: shrdl $28, %edi, %ecx
-; X86-NEXT: sarl $28, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: shrdl $28, %edx, %ecx
+; X86-NEXT: sarl $28, %edx
 ; X86-NEXT: cmpl %ecx, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sbbl %edx, %edi
 ; X86-NEXT: cmovbl %esi, %ecx
-; X86-NEXT: cmovbl %edx, %edi
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: cmovbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 %ax = ashr i128 %a, 64
 %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 4c31703..89afd1b 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -38,8 +38,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT: .cfi_def_cfa_offset 16
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: .cfi_def_cfa_offset 44
+; X86-NEXT: subl $28, %esp
+; X86-NEXT: .cfi_def_cfa_offset 48
 ; X86-NEXT: .cfi_offset %esi, -20
 ; X86-NEXT: .cfi_offset %edi, -16
 ; X86-NEXT: .cfi_offset %ebx, -12
@@ -147,7 +147,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT: andb $1, %al
 ; X86-NEXT: movb %al, 16(%ecx)
 ; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: .cfi_def_cfa_offset 20
 ; X86-NEXT: popl %esi
 ; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 087cd30..9bd38db 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -104,6 +104,72 @@ entry:
 }
 declare <2 x double> @foo()
 
+define i64 @pr150117(<31 x i8> %a0) nounwind {
+; X86-LABEL: pr150117:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shll $8, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: shll $8, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: shll $24, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: pinsrw $2, %edx, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $8, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: pinsrw $3, %ecx, %xmm0
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movd %xmm0, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: pr150117:
+; X64: # %bb.0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r8d
+; X64-NEXT: shll $8, %r8d
+; X64-NEXT: orl %edi, %r8d
+; X64-NEXT: shll $8, %esi
+; X64-NEXT: orl %edx, %esi
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %esi, %ecx
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT: shll $24, %edx
+; X64-NEXT: orl %ecx, %edx
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: pinsrw $2, %r8d, %xmm0
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: shll $8, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: pinsrw $3, %ecx, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: retq
+ %shuffle = shufflevector <31 x i8> %a0, <31 x i8> zeroinitializer, <32 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %bitcast = bitcast <32 x i8> %shuffle to <4 x i64>
+ %elt = extractelement <4 x i64> %bitcast, i64 0
+ ret i64 %elt
+}
+
 ; OSS-Fuzz #15662
 ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15662
 define <4 x i32> @ossfuzz15662(ptr %in) {
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 5dcf190..834dfd6 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -8,7 +8,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE,GFNISSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE,GFNISSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
@@ -492,11 +493,20 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v8i16:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v8i16:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE2-NEXT: psrlw $8, %xmm1
+; GFNISSE2-NEXT: psllw $8, %xmm0
+; GFNISSE2-NEXT: por %xmm1, %xmm0
+; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v8i16:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX-LABEL: test_bitreverse_v8i16:
 ; GFNIAVX: # %bb.0:
@@ -605,11 +615,25 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v4i32:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v4i32:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm1, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm2, %xmm0
+; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v4i32:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX-LABEL: test_bitreverse_v4i32:
 ; GFNIAVX: # %bb.0:
@@ -720,11 +744,27 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
 ; XOP-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v2i64:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v2i64:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm1, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm2, %xmm0
+; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v2i64:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX-LABEL: test_bitreverse_v2i64:
 ; GFNIAVX: # %bb.0:
@@ -1042,15 +1082,30 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v16i16:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; GFNISSE-NEXT: pshufb %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
-; GFNISSE-NEXT: pshufb %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v16i16:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE2-NEXT: psrlw $8, %xmm2
+; GFNISSE2-NEXT: psllw $8, %xmm0
+; GFNISSE2-NEXT: por %xmm2, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE2-NEXT: psrlw $8, %xmm3
+; GFNISSE2-NEXT: psllw $8, %xmm1
+; GFNISSE2-NEXT: por %xmm3, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm1
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v16i16:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX1-LABEL: test_bitreverse_v16i16:
 ; GFNIAVX1: # %bb.0:
@@ -1241,15 +1296,39 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v8i32:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; GFNISSE-NEXT: pshufb %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
-; GFNISSE-NEXT: pshufb %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v8i32:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm2, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm3
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm4, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v8i32:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX1-LABEL: test_bitreverse_v8i32:
 ; GFNIAVX1: # %bb.0:
@@ -1444,15 +1523,43 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v4i64:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; GFNISSE-NEXT: pshufb %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
-; GFNISSE-NEXT: pshufb %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v4i64:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm2, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm3
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm4, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v4i64:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX1-LABEL: test_bitreverse_v4i64:
 ; GFNIAVX1: # %bb.0:
@@ -2035,19 +2142,44 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v32i16:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; GFNISSE-NEXT: pshufb %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pshufb %xmm4, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
-; GFNISSE-NEXT: pshufb %xmm4, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
-; GFNISSE-NEXT: pshufb %xmm4, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v32i16:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE2-NEXT: psrlw $8, %xmm4
+; GFNISSE2-NEXT: psllw $8, %xmm0
+; GFNISSE2-NEXT: por %xmm4, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm5
+; GFNISSE2-NEXT: psrlw $8, %xmm5
+; GFNISSE2-NEXT: psllw $8, %xmm1
+; GFNISSE2-NEXT: por %xmm5, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE2-NEXT: psrlw $8, %xmm5
+; GFNISSE2-NEXT: psllw $8, %xmm2
+; GFNISSE2-NEXT: por %xmm5, %xmm2
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm3, %xmm5
+; GFNISSE2-NEXT: psrlw $8, %xmm5
+; GFNISSE2-NEXT: psllw $8, %xmm3
+; GFNISSE2-NEXT: por %xmm5, %xmm3
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm3
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v32i16:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX1-LABEL: test_bitreverse_v32i16:
 ; GFNIAVX1: # %bb.0:
@@ -2393,19 +2525,61 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v16i32:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; GFNISSE-NEXT: pshufb %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pshufb %xmm4, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
-; GFNISSE-NEXT: pshufb %xmm4, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
-; GFNISSE-NEXT: pshufb %xmm4, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v16i32:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm4, %xmm4
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm5
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm2, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm2
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm3, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm3
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v16i32:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX1-LABEL: test_bitreverse_v16i32:
 ; GFNIAVX1: # %bb.0:
@@ -2759,19 +2933,69 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: retq
 ;
-; GFNISSE-LABEL: test_bitreverse_v8i64:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; GFNISSE-NEXT: pshufb %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pshufb %xmm4, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
-; GFNISSE-NEXT: pshufb %xmm4, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
-; GFNISSE-NEXT: pshufb %xmm4, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v8i64:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm4, %xmm4
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm5
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm2, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm2
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm3, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm3
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v8i64:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSSE3-NEXT: retq
 ;
 ; GFNIAVX1-LABEL: test_bitreverse_v8i64:
 ; GFNIAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
index a15d633..12dccca 100644
--- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
@@ -92,6 +92,8 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushl %esi
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: subl $8, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: .cfi_offset %esi, -8
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -101,15 +103,15 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: jge .LBB4_2
+; CHECK-NEXT: jge .LBB4_3
 ; CHECK-NEXT: # %bb.1: # %bb1
 ; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: .cfi_def_cfa_offset 4
-; CHECK-NEXT: retl
-; CHECK-NEXT: .LBB4_2: # %bb2
-; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: jmp .LBB4_2
+; CHECK-NEXT: .LBB4_3: # %bb2
 ; CHECK-NEXT: movl $2, %eax
+; CHECK-NEXT: .LBB4_2: # %bb1
+; CHECK-NEXT: addl $8, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: popl %esi
 ; CHECK-NEXT: .cfi_def_cfa_offset 4
 ; CHECK-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
index bfb9c43..0bf8370 100644
--- a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll
@@ -103,15 +103,15 @@ handler2:
 ; X64: $ip2state$try_in_catch:
 ; X64-NEXT: .long .Lfunc_begin0@IMGREL
 ; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp0@IMGREL+1
+; X64-NEXT: .long .Ltmp0@IMGREL
 ; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp1@IMGREL+1
+; X64-NEXT: .long .Ltmp1@IMGREL
 ; X64-NEXT: .long -1
 ; X64-NEXT: .long "?catch$2@?0?try_in_catch@4HA"@IMGREL
 ; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp2@IMGREL+1
+; X64-NEXT: .long .Ltmp2@IMGREL
 ; X64-NEXT: .long 2
-; X64-NEXT: .long .Ltmp3@IMGREL+1
+; X64-NEXT: .long .Ltmp3@IMGREL
 ; X64-NEXT: .long 1
 ; X64-NEXT: .long "?catch$4@?0?try_in_catch@4HA"@IMGREL
 ; X64-NEXT: .long 3
diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll
index 2491946..62ea510 100644
--- a/llvm/test/CodeGen/X86/win-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad.ll
@@ -214,9 +214,9 @@ try.cont:
 ; X64: $ip2state$try_catch_catch:
 ; X64-NEXT: .long .Lfunc_begin0@IMGREL
 ; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp0@IMGREL+1
+; X64-NEXT: .long .Ltmp0@IMGREL
 ; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp1@IMGREL+1
+; X64-NEXT: .long .Ltmp1@IMGREL
 ; X64-NEXT: .long -1
 ; X64-NEXT: .long "?catch$[[catch1bb]]@?0?try_catch_catch@4HA"@IMGREL
 ; X64-NEXT: .long 1
@@ -357,9 +357,9 @@ try.cont:
 ; X64-LABEL: $ip2state$branch_to_normal_dest:
 ; X64-NEXT: .long .Lfunc_begin1@IMGREL
 ; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL+1
+; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL
 ; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL+1
+; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL
 ; X64-NEXT: .long -1
 ; X64-NEXT: .long "?catch$[[catchbb]]@?0?branch_to_normal_dest@4HA"@IMGREL
 ; X64-NEXT: .long 1
diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll
index e3f7f5b..e9265a1 100644
--- a/llvm/test/CodeGen/X86/win-cleanuppad.ll
+++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll
@@ -191,7 +191,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
 ; X64-NEXT: .long 1
 ; X64-NEXT: .long .Ltmp6@IMGREL
 ; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp7@IMGREL+1
+; X64-NEXT: .long .Ltmp7@IMGREL
 ; X64-NEXT: .long -1
 
 attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/win32-eh-states.ll b/llvm/test/CodeGen/X86/win32-eh-states.ll
index 42ae5b0..e645199 100644
--- a/llvm/test/CodeGen/X86/win32-eh-states.ll
+++ b/llvm/test/CodeGen/X86/win32-eh-states.ll
@@ -86,11 +86,11 @@ catch.7:
 ; X64-LABEL: $ip2state$f:
 ; X64-NEXT: .long .Lfunc_begin0@IMGREL
 ; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long 0
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long -1
 ; X64-NEXT: .long "?catch${{.*}}@?0?f@4HA"@IMGREL
 ; X64-NEXT: .long 2
@@ -189,15 +189,15 @@ unreachable: ; preds = %entry
 ; X64-LABEL: $ip2state$g:
 ; X64-NEXT: .long .Lfunc_begin1@IMGREL
 ; X64-NEXT: .long -1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long 1
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long -1
 ; X64-NEXT: .long "?catch${{.*}}@?0?g@4HA"@IMGREL
 ; X64-NEXT: .long 2
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long 3
-; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1
+; X64-NEXT: .long .Ltmp{{.*}}@IMGREL
 ; X64-NEXT: .long 2
diff --git a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
new file mode 100644
index 0000000..5ac90a0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck -check-prefix=CHECK64 %s
+
+define i64 @test_sdiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_sdiv_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __alldiv
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_sdiv_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: cqto
+; CHECK64-NEXT: idivq %r8
+; CHECK64-NEXT: retq
+ %ret = sdiv i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_srem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_srem_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __allrem
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_srem_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: cqto
+; CHECK64-NEXT: idivq %r8
+; CHECK64-NEXT: movq %rdx, %rax
+; CHECK64-NEXT: retq
+ %ret = srem i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_udiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_udiv_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __aulldiv
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_udiv_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: xorl %edx, %edx
+; CHECK64-NEXT: divq %r8
+; CHECK64-NEXT: retq
+ %ret = udiv i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_urem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_urem_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __aullrem
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_urem_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: xorl %edx, %edx
+; CHECK64-NEXT: divq %r8
+; CHECK64-NEXT: movq %rdx, %rax
+; CHECK64-NEXT: retq
+ %ret = urem i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_mul_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_mul_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl %esi
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT: movl %ecx, %eax
+; CHECK32-NEXT: mull %esi
+; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT: addl %ecx, %edx
+; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT: addl %esi, %edx
+; CHECK32-NEXT: popl %esi
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_mul_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: imulq %rdx, %rax
+; CHECK64-NEXT: retq
+ %ret = mul i64 %a, %b
+ ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/X86/win32-ssp.ll b/llvm/test/CodeGen/X86/win32-ssp.ll
index 536a6d5..259f039 100644
--- a/llvm/test/CodeGen/X86/win32-ssp.ll
+++ b/llvm/test/CodeGen/X86/win32-ssp.ll
@@ -1,7 +1,9 @@
 ; RUN: llc -mtriple=x86_64-w64-mingw32 < %s -o - | FileCheck --check-prefix=MINGW %s
+; RUN: llc -mtriple=x86_64-pc-cygwin < %s -o - | FileCheck --check-prefix=MINGW %s
 ; RUN: llc -mtriple=x86_64-pc-windows-itanium < %s -o - | FileCheck --check-prefix=MSVC %s
 ; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck --check-prefix=MSVC %s
 ; RUN: llc -mtriple=i686-w64-mingw32 < %s -o - | FileCheck --check-prefix=MINGW %s
+; RUN: llc -mtriple=i686-pc-cygwin < %s -o - | FileCheck --check-prefix=MINGW %s
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare dso_local void @other(ptr)
diff --git a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
index bc5be7a..75f156f 100644
--- a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
+++ b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll
@@ -8,8 +8,8 @@ define i32 @foobar() gc "statepoint-example" personality ptr @__gxx_personality_
 ; CHECK-NEXT: .seh_stackalloc 40
 ; CHECK-NEXT: .seh_endprologue
 ; CHECK-NEXT: callq bar
-; CHECK-NEXT: .Ltmp0:
 ; CHECK-NEXT: nop
+; CHECK-NEXT: .Ltmp0:
 ; CHECK-NEXT: .seh_startepilogue
 ; CHECK-NEXT: addq $40, %rsp
 ; CHECK-NEXT: .seh_endepilogue
diff --git a/llvm/test/CodeGen/X86/wineh-coreclr.ll b/llvm/test/CodeGen/X86/wineh-coreclr.ll
index baf5eaa..a3d0fde 100644
--- a/llvm/test/CodeGen/X86/wineh-coreclr.ll
+++ b/llvm/test/CodeGen/X86/wineh-coreclr.ll
@@ -38,6 +38,7 @@ entry:
 ; CHECK: [[test1_before_f1:.+]]:
 ; CHECK-NEXT: movl $1, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test1_after_f1:.+]]:
 invoke void @f(i32 1)
 to label %inner_try unwind label %finally
@@ -46,6 +47,7 @@ inner_try:
 ; CHECK: [[test1_before_f2:.+]]:
 ; CHECK-NEXT: movl $2, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test1_after_f2:.+]]:
 invoke void @f(i32 2)
 to label %finally.clone unwind label %exn.dispatch
@@ -69,6 +71,7 @@ catch1:
 ; CHECK: [[test1_before_f3:.+]]:
 ; CHECK-NEXT: movl $3, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test1_after_f3:.+]]:
 invoke void @f(i32 3) [ "funclet"(token %catch.pad1) ]
 to label %catch1.ret unwind label %finally
@@ -92,6 +95,7 @@ catch2:
 ; CHECK: [[test1_before_f4:.+]]:
 ; CHECK-NEXT: movl $4, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test1_after_f4:.+]]:
 invoke void @f(i32 4) [ "funclet"(token %catch.pad2) ]
 to label %try_in_catch unwind label %finally
@@ -100,6 +104,7 @@ try_in_catch:
 ; CHECK: [[test1_before_f5:.+]]:
 ; CHECK-NEXT: movl $5, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test1_after_f5:.+]]:
 invoke void @f(i32 5) [ "funclet"(token %catch.pad2) ]
 to label %catch2.ret unwind label %fault
@@ -116,6 +121,7 @@ fault:
 ; CHECK: [[test1_before_f6:.+]]:
 ; CHECK-NEXT: movl $6, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test1_after_f6:.+]]:
 invoke void @f(i32 6) [ "funclet"(token %fault.pad) ]
 to label %fault.ret unwind label %finally
@@ -312,6 +318,7 @@ unreachable:
 ; CHECK: [[test2_before_f1:.+]]:
 ; CHECK-NEXT: movl $1, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test2_after_f1:.+]]:
 ; CHECK: .seh_proc [[test2_catch1:[^ ]+]]
 ; CHECK: .seh_proc [[test2_catch2:[^ ]+]]
@@ -320,6 +327,7 @@ unreachable:
 ; CHECK: [[test2_before_f2:.+]]:
 ; CHECK-NEXT: movl $2, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test2_after_f2:.+]]:
 ; CHECK: int3
 ; CHECK: [[test2_end:.*func_end.*]]:
@@ -448,6 +456,7 @@ entry:
 ; CHECK: [[test3_before_f1:.+]]:
 ; CHECK-NEXT: movl $1, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test3_after_f1:.+]]:
 invoke void @f(i32 1)
 to label %exit unwind label %fault1
@@ -474,6 +483,7 @@ fault4:
 ; CHECK: [[test3_before_f6:.+]]:
 ; CHECK-NEXT: movl $6, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test3_after_f6:.+]]:
 invoke void @f(i32 6) ["funclet"(token %fault.pad4)]
 to label %fault4.cont unwind label %exn.dispatch1
@@ -482,6 +492,7 @@ fault4.cont:
 ; CHECK: [[test3_before_f7:.+]]:
 ; CHECK-NEXT: movl $7, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test3_after_f7:.+]]:
 invoke void @f(i32 7) ["funclet"(token %fault.pad4)]
 to label %unreachable unwind label %fault5
@@ -512,6 +523,7 @@ unreachable:
 ; CHECK: [[test3_before_f4:.+]]:
 ; CHECK-NEXT: movl $4, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test3_after_f4:.+]]:
 ; CHECK: int3
 ; CHECK: .seh_proc [[test3_fault2:[^ ]+]]
@@ -520,6 +532,7 @@ unreachable:
 ; CHECK: [[test3_before_f3:.+]]:
 ; CHECK-NEXT: movl $3, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test3_after_f3:.+]]:
 ; CHECK: int3
 ; CHECK: .seh_proc [[test3_fault1:[^ ]+]]
@@ -528,6 +541,7 @@ unreachable:
 ; CHECK: [[test3_before_f2:.+]]:
 ; CHECK-NEXT: movl $2, %ecx
 ; CHECK-NEXT: callq f
+; CHECK-NEXT: nop
 ; CHECK-NEXT: [[test3_after_f2:.+]]:
 ; CHECK: int3
 ; CHECK: [[test3_end:.*func_end.*]]: