path: root/llvm/test/CodeGen/X86
Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--  llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll | 11
-rw-r--r--  llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll | 39
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/ptrtoaddr.ll | 109
-rw-r--r--  llvm/test/CodeGen/X86/PR37310.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/align-basic-block-sections.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_copy.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/apx/cf.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/apx/domain-reassignment.mir | 36
-rw-r--r--  llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/attr-dontcall.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/attr-function-return.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir | 8
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/avoid-sfb-offset.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx-load-store.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx512f-256-set0.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx512f-large-stack.ll | 23
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/bitcnt-false-dep.ll | 9
-rw-r--r--  llvm/test/CodeGen/X86/break-false-dep-crash.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/callbr-asm-loop.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/callbr-asm-outputs-regallocfast.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/cf-opt-memops.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir | 6
-rw-r--r--  llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir | 6
-rw-r--r--  llvm/test/CodeGen/X86/combine-gfni.ll | 101
-rw-r--r--  llvm/test/CodeGen/X86/combine-movmsk.ll | 25
-rw-r--r--  llvm/test/CodeGen/X86/combine-vpmadd52.ll | 185
-rw-r--r--  llvm/test/CodeGen/X86/conditional-tailcall-samedest.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/cse-two-preds.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/domain-reassignment.mir | 36
-rw-r--r--  llvm/test/CodeGen/X86/early-clobber.mir | 76
-rw-r--r--  llvm/test/CodeGen/X86/fma_patterns.ll | 72
-rw-r--r--  llvm/test/CodeGen/X86/fma_patterns_wide.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/fp-double-rounding.ll | 53
-rw-r--r--  llvm/test/CodeGen/X86/fp16-reload.mir | 34
-rw-r--r--  llvm/test/CodeGen/X86/fp16-spill.ll | 64
-rw-r--r--  llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll | 46
-rw-r--r--  llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll | 37
-rw-r--r--  llvm/test/CodeGen/X86/freeze-binary.ll | 70
-rw-r--r--  llvm/test/CodeGen/X86/freeze.ll | 45
-rw-r--r--  llvm/test/CodeGen/X86/fshl-fshr-constant.ll | 149
-rw-r--r--  llvm/test/CodeGen/X86/function-align.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/huge-stack.ll | 72
-rw-r--r--  llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/kmov.ll | 51
-rw-r--r--  llvm/test/CodeGen/X86/large-displacements-fastisel.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/large-displacements.ll | 82
-rw-r--r--  llvm/test/CodeGen/X86/llrint-conv.ll | 140
-rw-r--r--  llvm/test/CodeGen/X86/lrint-conv-i32.ll | 74
-rw-r--r--  llvm/test/CodeGen/X86/lrint-conv-i64.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/merge-huge-sp-updates.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 276
-rw-r--r--  llvm/test/CodeGen/X86/movtopush.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll | 17
-rw-r--r--  llvm/test/CodeGen/X86/peep-test-5.ll | 51
-rw-r--r--  llvm/test/CodeGen/X86/peephole-test-after-add.mir | 12
-rw-r--r--  llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/pr152150.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/pr152630.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/pr154492.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/pr30821.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/pr33010.ll | 11
-rw-r--r--  llvm/test/CodeGen/X86/pr38795.ll | 7
-rw-r--r--  llvm/test/CodeGen/X86/pr38952.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/pr48064.mir | 12
-rw-r--r--  llvm/test/CodeGen/X86/pr62286.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/pr67333.ll | 10
-rw-r--r--  llvm/test/CodeGen/X86/ptrtoaddr.ll | 113
-rw-r--r--  llvm/test/CodeGen/X86/scalarize-strict-fsetcc.ll | 293
-rw-r--r--  llvm/test/CodeGen/X86/scheduler-asm-moves.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/select-optimize.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/select-smin-smax.ll | 381
-rw-r--r--  llvm/test/CodeGen/X86/shrink_wrap_dbg_value.mir | 10
-rw-r--r--  llvm/test/CodeGen/X86/stack-clash-extra-huge.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/stack-clash-huge.ll | 36
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-fixup-call.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-fixup-copy-prop-neg.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-fixup-undef-def.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-fixup-undef.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-invoke-ra-hoist-copies.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-invoke-ra.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/statepoint-vreg-folding.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/tied-depbreak.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/unfoldMemoryOperand.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/vector-llrint-f16.ll | 15
-rw-r--r--  llvm/test/CodeGen/X86/vector-llrint.ll | 1188
-rw-r--r--  llvm/test/CodeGen/X86/vector-lrint-f16.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-lrint.ll | 1714
-rw-r--r--  llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 113
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll | 263
-rw-r--r--  llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll | 66
-rw-r--r--  llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir | 188
-rw-r--r--  llvm/test/CodeGen/X86/win64-eh-unwindv2-push-pop-stack-alloc.mir | 199
-rw-r--r--  llvm/test/CodeGen/X86/win64-eh-unwindv2.ll | 56
-rw-r--r--  llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/zero-call-used-regs-debug-info.mir | 4
110 files changed, 6539 insertions, 655 deletions
diff --git a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 8d690ba..6541693 100644
--- a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -13,25 +13,24 @@ define fastcc void @mp_sqrt(i32 %n, i32 %radix, ptr %in, ptr %out, ptr %tmp1, pt
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: movl $1, %ebx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %bb.i5
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: addl %ebx, %ebx
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %mp_unexp_mp2d.exit.i
; CHECK-NEXT: je .LBB0_3
; CHECK-NEXT: # %bb.5: # %cond_next.i
-; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: jne .LBB0_3
; CHECK-NEXT: # %bb.6: # %cond_next36.i
; CHECK-NEXT: movl $0, 0
-; CHECK-NEXT: movzbl %al, %ebp
+; CHECK-NEXT: movzbl %cl, %ebp
; CHECK-NEXT: andl $1, %ebp
; CHECK-NEXT: xorpd %xmm0, %xmm0
; CHECK-NEXT: xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index bf939c4..3913e93 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -38,7 +38,6 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: ## %bb.1: ## %bb116.i
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.2: ## %bb52.i.i
-; CHECK-NEXT: testb $1, %bl
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.3: ## %bb142.i
; CHECK-NEXT: je LBB0_25
@@ -49,23 +48,23 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: jmp LBB0_5
; CHECK-NEXT: LBB0_21: ## %bb7806
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: Ltmp16:
+; CHECK-NEXT: Ltmp16: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN12wxStringBase6appendEmw
-; CHECK-NEXT: Ltmp17:
+; CHECK-NEXT: Ltmp17: ## EH_LABEL
; CHECK-NEXT: LBB0_5: ## %bb3261
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmpl $37, 0
; CHECK-NEXT: jne LBB0_25
; CHECK-NEXT: ## %bb.6: ## %bb3306
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: Ltmp0: ## EH_LABEL
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN12wxStringBaseaSEPKw
-; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: Ltmp1: ## EH_LABEL
; CHECK-NEXT: ## %bb.7: ## %bb3314
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: movl 0, %eax
@@ -89,11 +88,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: je LBB0_14
; CHECK-NEXT: ## %bb.13: ## %bb155.i8541
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: Ltmp4: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll _gmtime_r
-; CHECK-NEXT: Ltmp5:
+; CHECK-NEXT: Ltmp5: ## EH_LABEL
; CHECK-NEXT: LBB0_14: ## %bb182.i8560
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: testb $1, %bl
@@ -103,7 +102,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: je LBB0_18
; CHECK-NEXT: ## %bb.17: ## %bb440.i8663
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: Ltmp6: ## EH_LABEL
; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl L_.str33$non_lazy_ptr, %eax
@@ -113,47 +112,47 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: movl %ebp, (%esp)
; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5
; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_
-; CHECK-NEXT: Ltmp7:
+; CHECK-NEXT: Ltmp7: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_18
; CHECK-NEXT: LBB0_15: ## %bb187.i8591
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: jne LBB0_25
; CHECK-NEXT: LBB0_18: ## %invcont5814
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: Ltmp8:
+; CHECK-NEXT: Ltmp8: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
; CHECK-NEXT: subl $4, %esp
-; CHECK-NEXT: Ltmp9:
+; CHECK-NEXT: Ltmp9: ## EH_LABEL
; CHECK-NEXT: ## %bb.19: ## %invcont5831
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: Ltmp10:
+; CHECK-NEXT: Ltmp10: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN12wxStringBase10ConcatSelfEmPKwm
-; CHECK-NEXT: Ltmp11:
+; CHECK-NEXT: Ltmp11: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_5
; CHECK-NEXT: LBB0_9: ## %bb5657
-; CHECK-NEXT: Ltmp13:
+; CHECK-NEXT: Ltmp13: ## EH_LABEL
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, (%esp)
; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
-; CHECK-NEXT: Ltmp14:
+; CHECK-NEXT: Ltmp14: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: LBB0_20: ## %bb5968
-; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: Ltmp2: ## EH_LABEL
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz
; CHECK-NEXT: subl $4, %esp
-; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: Ltmp3: ## EH_LABEL
; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: addl $28, %esp
@@ -163,13 +162,13 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl $4
; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp
-; CHECK-NEXT: Ltmp15:
+; CHECK-NEXT: Ltmp15: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: LBB0_24: ## %lpad8185
-; CHECK-NEXT: Ltmp12:
+; CHECK-NEXT: Ltmp12: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: LBB0_22: ## %lpad.loopexit
-; CHECK-NEXT: Ltmp18:
+; CHECK-NEXT: Ltmp18: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_25
; CHECK-NEXT: Lfunc_end0:
entry:
diff --git a/llvm/test/CodeGen/X86/GlobalISel/ptrtoaddr.ll b/llvm/test/CodeGen/X86/GlobalISel/ptrtoaddr.ll
new file mode 100644
index 0000000..f65d99d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/ptrtoaddr.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=CHECK
+
+define i1 @ptrtoaddr_1(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: xorb $1, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i1
+ %ret = xor i1 %trunc, 1
+ ret i1 %ret
+}
+
+define i8 @ptrtoaddr_8(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notb %al
+; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i8
+ %ret = xor i8 %trunc, -1
+ ret i8 %ret
+}
+
+define i16 @ptrtoaddr_16(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notw %ax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i16
+ %ret = xor i16 %trunc, -1
+ ret i16 %ret
+}
+
+define i32 @ptrtoaddr_32(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i32
+ %ret = xor i32 %trunc, -1
+ ret i32 %ret
+}
+
+define i64 @ptrtoaddr_64(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %ret = xor i64 %addr, -1
+ ret i64 %ret
+}
+
+define i128 @ptrtoaddr_128(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_128:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: notq %rdx
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %ext = zext i64 %addr to i128
+ %ret = xor i128 %ext, -1
+ ret i128 %ret
+}
+
+; TODO: Vector version cannot be handled by GlobalIsel yet (same error as ptrtoint: https://github.com/llvm/llvm-project/issues/150875).
+; define <2 x i64> @ptrtoaddr_vec(<2 x ptr> %p) {
+; entry:
+; %addr = ptrtoaddr <2 x ptr> %p to <2 x i64>
+; %ret = xor <2 x i64> %addr, <i64 -1, i64 -1>
+; ret <2 x i64> %ret
+;}
+
+; UTC_ARGS: --disable
+
+@foo = global [16 x i8] zeroinitializer
+@addr = global i64 ptrtoaddr (ptr @foo to i64)
+; CHECK: addr:
+; CHECK-NEXT: .quad foo
+; CHECK-NEXT: .size addr, 8
+@addr_plus_one = global i64 ptrtoaddr (ptr getelementptr (i8, ptr @foo, i64 1) to i64)
+; CHECK: addr_plus_one:
+; CHECK-NEXT: .quad foo+1
+; CHECK-NEXT: .size addr_plus_one, 8
+@const_addr = global i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64)
+; CHECK: const_addr:
+; CHECK-NEXT: .quad 0+1
+; CHECK-NEXT: .size const_addr, 8
diff --git a/llvm/test/CodeGen/X86/PR37310.mir b/llvm/test/CodeGen/X86/PR37310.mir
index 6c68f79..6a7f5bd 100644
--- a/llvm/test/CodeGen/X86/PR37310.mir
+++ b/llvm/test/CodeGen/X86/PR37310.mir
@@ -97,8 +97,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
- { id: 0, name: q, type: default, offset: 0, size: 512, alignment: 16,
diff --git a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
index b820541..a992222 100644
--- a/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-use-between-allocas.mir
@@ -118,8 +118,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: a, type: default, offset: 0, size: 1, alignment: 8,
diff --git a/llvm/test/CodeGen/X86/align-basic-block-sections.mir b/llvm/test/CodeGen/X86/align-basic-block-sections.mir
index 02ccbcf..1bf39ec 100644
--- a/llvm/test/CodeGen/X86/align-basic-block-sections.mir
+++ b/llvm/test/CodeGen/X86/align-basic-block-sections.mir
@@ -83,8 +83,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
index 7f862db..ab12ab3 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
@@ -43,8 +43,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
index 3a03fbe..c7d241f 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
@@ -93,8 +93,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
index a19ff2a..66b15aa 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
@@ -46,8 +46,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
index e3f15c6..1e3b242 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
@@ -59,8 +59,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8,
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
index 89f6773..ac2cdb4 100644
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
@@ -57,8 +57,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
index e52ce6c..b2651e9 100644
--- a/llvm/test/CodeGen/X86/apx/cf.ll
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -229,3 +229,21 @@ entry:
call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %p, i32 1, <1 x i1> %1)
ret void
}
+
+define i64 @redundant_test(i64 %num, ptr %p1, i64 %in) {
+; CHECK-LABEL: redundant_test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl $-32, %edi
+; CHECK-NEXT: cfcmoveq (%rsi), %rax
+; CHECK-NEXT: {nf} addq %rdx, %rax
+; CHECK-NEXT: cmovneq %rdi, %rax
+; CHECK-NEXT: retq
+ %and = and i64 %num, 4294967264
+ %cmp = icmp eq i64 %and, 0
+ %mask = bitcast i1 %cmp to <1 x i1>
+ %condload = tail call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %p1, i32 8, <1 x i1> %mask, <1 x i64> poison)
+ %v = bitcast <1 x i64> %condload to i64
+ %add = add i64 %v, %in
+ %sel = select i1 %cmp, i64 %add, i64 %num
+ ret i64 %sel
+}
diff --git a/llvm/test/CodeGen/X86/apx/domain-reassignment.mir b/llvm/test/CodeGen/X86/apx/domain-reassignment.mir
index e478e66..0f13a5b 100644
--- a/llvm/test/CodeGen/X86/apx/domain-reassignment.mir
+++ b/llvm/test/CodeGen/X86/apx/domain-reassignment.mir
@@ -109,8 +109,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -255,8 +255,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -375,8 +375,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -487,8 +487,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -590,8 +590,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -686,8 +686,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -756,8 +756,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -829,8 +829,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -911,8 +911,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
entry_values: []
diff --git a/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir b/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir
index 54e1406..7b4945b 100644
--- a/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir
+++ b/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir
@@ -125,8 +125,8 @@ frameInfo:
hasTailCall: false
isCalleeSavedInfoValid: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 0, size: 1, alignment: 16, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
diff --git a/llvm/test/CodeGen/X86/attr-dontcall.ll b/llvm/test/CodeGen/X86/attr-dontcall.ll
index 7de44b8..9024280 100644
--- a/llvm/test/CodeGen/X86/attr-dontcall.ll
+++ b/llvm/test/CodeGen/X86/attr-dontcall.ll
@@ -1,6 +1,6 @@
; RUN: not llc -mtriple=x86_64 -global-isel=0 -fast-isel=0 -stop-after=finalize-isel < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=x86_64 -global-isel=0 -fast-isel=1 -stop-after=finalize-isel < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=x86_64 -global-isel=1 -fast-isel=0 -stop-after=irtranslator -global-isel-abort=0 < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=x86_64 -global-isel=1 -fast-isel=0 -global-isel-abort=0 < %s 2>&1 | FileCheck %s
declare void @foo() "dontcall-error"="e"
define void @bar() {
diff --git a/llvm/test/CodeGen/X86/attr-function-return.mir b/llvm/test/CodeGen/X86/attr-function-return.mir
index 91c03e8..96db27b 100644
--- a/llvm/test/CodeGen/X86/attr-function-return.mir
+++ b/llvm/test/CodeGen/X86/attr-function-return.mir
@@ -45,8 +45,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
index 9329299..99fee27 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir
@@ -127,8 +127,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
constants: []
@@ -185,8 +185,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
constants: []
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
index 4da4f03..50b2433 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change2.mir
@@ -138,8 +138,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
constants: []
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
index 19e3f38..7a4b993 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
@@ -154,8 +154,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
constants: []
diff --git a/llvm/test/CodeGen/X86/avoid-sfb-offset.mir b/llvm/test/CodeGen/X86/avoid-sfb-offset.mir
index 978b3e7..eec19e7 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb-offset.mir
+++ b/llvm/test/CodeGen/X86/avoid-sfb-offset.mir
@@ -67,8 +67,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
- { id: 0, name: a, type: default, offset: 0, size: 144, alignment: 16,
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index 3f856d3..cc2cedb 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -34,7 +34,7 @@ define void @test_256_load(ptr nocapture %d, ptr nocapture %f, ptr nocapture %i)
;
; CHECK_O0-LABEL: test_256_load:
; CHECK_O0: # %bb.0: # %entry
-; CHECK_O0-NEXT: subq $184, %rsp
+; CHECK_O0-NEXT: subq $136, %rsp
; CHECK_O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK_O0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK_O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -54,7 +54,7 @@ define void @test_256_load(ptr nocapture %d, ptr nocapture %f, ptr nocapture %i)
; CHECK_O0-NEXT: vmovapd %ymm2, (%rdi)
; CHECK_O0-NEXT: vmovaps %ymm1, (%rsi)
; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK_O0-NEXT: addq $184, %rsp
+; CHECK_O0-NEXT: addq $136, %rsp
; CHECK_O0-NEXT: vzeroupper
; CHECK_O0-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/avx512f-256-set0.mir b/llvm/test/CodeGen/X86/avx512f-256-set0.mir
index 3915599..a77c724 100644
--- a/llvm/test/CodeGen/X86/avx512f-256-set0.mir
+++ b/llvm/test/CodeGen/X86/avx512f-256-set0.mir
@@ -52,8 +52,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
constants: []
diff --git a/llvm/test/CodeGen/X86/avx512f-large-stack.ll b/llvm/test/CodeGen/X86/avx512f-large-stack.ll
new file mode 100644
index 0000000..3cb5391c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512f-large-stack.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 4
+; RUN: llc -O0 -mtriple=x86_64 -mattr=+avx512f -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+define void @f(i16 %LGV2, i1 %LGV3) {
+; CHECK-LABEL: f:
+; CHECK: # %bb.0: # %BB
+; CHECK-NEXT: subq $2147483528, %rsp # imm = 0x7FFFFF88
+; CHECK-NEXT: .cfi_def_cfa_offset 2147483536
+; CHECK-NEXT: movb %sil, %cl
+; CHECK-NEXT: movw %di, %ax
+; CHECK-NEXT: movswq %ax, %rax
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: movabsq $-2147483768, %rdx # imm = 0xFFFFFFFF7FFFFF88
+; CHECK-NEXT: movb %cl, (%rsp,%rdx)
+; CHECK-NEXT: addq $2147483528, %rsp # imm = 0x7FFFFF88
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+BB:
+ %A = alloca i1, i33 2147483648, align 1
+ %G = getelementptr i1, ptr %A, i16 %LGV2
+ %G4 = getelementptr i1, ptr %G, i32 -2147483648
+ store i1 %LGV3, ptr %G4, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir b/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir
index a49a4e2..34f40b2 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-mir-parse.mir
@@ -93,8 +93,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '',
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir b/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
index e49ff14..b71ccc5 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
+++ b/llvm/test/CodeGen/X86/basic-block-sections-mir-parse.mir
@@ -70,8 +70,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default,
callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '',
diff --git a/llvm/test/CodeGen/X86/bitcnt-false-dep.ll b/llvm/test/CodeGen/X86/bitcnt-false-dep.ll
index 5f576c8..793cbb8 100644
--- a/llvm/test/CodeGen/X86/bitcnt-false-dep.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-false-dep.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s --check-prefix=HSW
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s --check-prefix=SKL
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=alderlake | FileCheck %s --check-prefix=ADL
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont -mattr=+lzcnt,+bmi | FileCheck %s --check-prefix=SKL
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=goldmont -mattr=+lzcnt,+bmi | FileCheck %s --check-prefix=SKL
@@ -37,6 +38,10 @@ ret:
;SKL-LABEL:@loopdep_popcnt32
;SKL: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
;SKL-NEXT: popcntl {{.*}}, [[GPR0]]
+
+;ADL-LABEL:@loopdep_popcnt32
+;ADL-NOT: xor
+;ADL: popcntl
}
define i64 @loopdep_popcnt64(ptr nocapture %x, ptr nocapture %y) nounwind {
@@ -63,6 +68,10 @@ ret:
;SKL-LABEL:@loopdep_popcnt64
;SKL: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
;SKL-NEXT: popcntq {{.*}}, %r[[GPR0]]
+
+;ADL-LABEL:@loopdep_popcnt64
+;ADL-NOT: xor
+;ADL: popcntq
}
define i32 @loopdep_tzct32(ptr nocapture %x, ptr nocapture %y) nounwind {
diff --git a/llvm/test/CodeGen/X86/break-false-dep-crash.mir b/llvm/test/CodeGen/X86/break-false-dep-crash.mir
index 713633c..febc49d 100644
--- a/llvm/test/CodeGen/X86/break-false-dep-crash.mir
+++ b/llvm/test/CodeGen/X86/break-false-dep-crash.mir
@@ -93,8 +93,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/callbr-asm-loop.ll b/llvm/test/CodeGen/X86/callbr-asm-loop.ll
new file mode 100644
index 0000000..0b68988
--- /dev/null
+++ b/llvm/test/CodeGen/X86/callbr-asm-loop.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -O1 -mtriple=i686-- < %s | FileCheck %s
+
+; Test that causes multiple defs of %eax.
+define i32 @loop1() nounwind {
+; CHECK-LABEL: loop1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: # %tailrecurse
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movl $1, %edx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: jmp .LBB0_1
+; CHECK-NEXT: .LBB0_2: # Inline asm indirect target
+; CHECK-NEXT: # %tailrecurse.tailrecurse_crit_edge
+; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: # Label of block must be emitted
+; CHECK-NEXT: jmp .LBB0_1
+; CHECK-NEXT: .LBB0_3: # Inline asm indirect target
+; CHECK-NEXT: # %lab2.split
+; CHECK-NEXT: # Label of block must be emitted
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retl
+entry:
+ br label %tailrecurse
+
+tailrecurse:
+ %0 = callbr { i32, i32 } asm "", "={ax},={dx},0,1,!i,!i"(i32 0, i32 1) #1
+ to label %tailrecurse.backedge [label %tailrecurse.backedge, label %lab2.split]
+
+tailrecurse.backedge:
+ br label %tailrecurse
+
+lab2.split:
+ %asmresult5 = extractvalue { i32, i32 } %0, 1
+ ret i32 %asmresult5
+}
diff --git a/llvm/test/CodeGen/X86/callbr-asm-outputs-regallocfast.mir b/llvm/test/CodeGen/X86/callbr-asm-outputs-regallocfast.mir
index 1923114..f5af0ad 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-outputs-regallocfast.mir
+++ b/llvm/test/CodeGen/X86/callbr-asm-outputs-regallocfast.mir
@@ -96,8 +96,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/cf-opt-memops.mir b/llvm/test/CodeGen/X86/cf-opt-memops.mir
index 44dead8..86719a2 100644
--- a/llvm/test/CodeGen/X86/cf-opt-memops.mir
+++ b/llvm/test/CodeGen/X86/cf-opt-memops.mir
@@ -68,8 +68,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir b/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir
index 583e54b..4f80f4f 100644
--- a/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir
+++ b/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir
@@ -21,8 +21,10 @@ liveins:
frameInfo:
maxAlignment: 1
hasCalls: true
- savePoint: '%bb.1'
- restorePoint: '%bb.1'
+ savePoint:
+ - point: '%bb.1'
+ restorePoint:
+ - point: '%bb.1'
machineFunctionInfo: {}
body: |
bb.0:
diff --git a/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir b/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir
index 8f04721..38c081c 100644
--- a/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir
+++ b/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir
@@ -28,8 +28,10 @@ liveins:
frameInfo:
maxAlignment: 1
hasCalls: true
- savePoint: '%bb.1'
- restorePoint: '%bb.1'
+ savePoint:
+ - point: '%bb.1'
+ restorePoint:
+ - point: '%bb.1'
machineFunctionInfo: {}
body: |
bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-gfni.ll b/llvm/test/CodeGen/X86/combine-gfni.ll
new file mode 100644
index 0000000..b105cdf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-gfni.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+define <16 x i8> @gf2p8affineqb_freeze(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+; SSE-LABEL: gf2p8affineqb_freeze:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE-NEXT: gf2p8affineqb $11, %xmm1, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: gf2p8affineqb_freeze:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineqb $11, %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: gf2p8affineqb_freeze:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovb2m %xmm2, %k1
+; AVX512-NEXT: vgf2p8affineqb $11, %xmm1, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %i = icmp slt <16 x i8> %a2, zeroinitializer
+ %g = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %a1, <16 x i8> %a1, i8 11)
+ %f = freeze <16 x i8> %g
+ %r = select <16 x i1> %i, <16 x i8> %f, <16 x i8> %a0
+ ret <16 x i8> %r
+}
+
+define <16 x i8> @gf2p8affineinvqb_freeze(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+; SSE-LABEL: gf2p8affineinvqb_freeze:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE-NEXT: gf2p8affineinvqb $11, %xmm1, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: gf2p8affineinvqb_freeze:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8affineinvqb $11, %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: gf2p8affineinvqb_freeze:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovb2m %xmm2, %k1
+; AVX512-NEXT: vgf2p8affineinvqb $11, %xmm1, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %i = icmp slt <16 x i8> %a2, zeroinitializer
+ %g = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %a1, <16 x i8> %a1, i8 11)
+ %f = freeze <16 x i8> %g
+ %r = select <16 x i1> %i, <16 x i8> %f, <16 x i8> %a0
+ ret <16 x i8> %r
+}
+
+define <16 x i8> @gf2p8mulb_freeze(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+; SSE-LABEL: gf2p8mulb_freeze:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtb %xmm2, %xmm3
+; SSE-NEXT: gf2p8mulb %xmm1, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: gf2p8mulb_freeze:
+; AVX: # %bb.0:
+; AVX-NEXT: vgf2p8mulb %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: gf2p8mulb_freeze:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovb2m %xmm2, %k1
+; AVX512-NEXT: vgf2p8mulb %xmm1, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %i = icmp slt <16 x i8> %a2, zeroinitializer
+ %g = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %a1, <16 x i8> %a1)
+ %f = freeze <16 x i8> %g
+ %r = select <16 x i1> %i, <16 x i8> %f, <16 x i8> %a0
+ ret <16 x i8> %r
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8)
+declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>)
diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll
index baa0553..c107a98 100644
--- a/llvm/test/CodeGen/X86/combine-movmsk.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk.ll
@@ -489,6 +489,31 @@ define i32 @or_pmovmskb_pmovmskb(<16 x i8> %a0, <8 x i16> %a1) {
ret i32 %7
}
+; FREEZE(MOVMSK(X)) -> MOVMSK(FREEZE(X))
+define i32 @movmskps_freeze(<4 x i32> %a0) {
+; SSE-LABEL: movmskps_freeze:
+; SSE: # %bb.0:
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: movmskps_freeze:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: retq
+;
+; ADL-LABEL: movmskps_freeze:
+; ADL: # %bb.0:
+; ADL-NEXT: vmovmskps %xmm0, %eax
+; ADL-NEXT: retq
+ %1 = icmp slt <4 x i32> %a0, zeroinitializer
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ %3 = bitcast <4 x i32> %2 to <4 x float>
+ %4 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %3)
+ %5 = freeze i32 %4
+ %6 = and i32 %5, 15
+ ret i32 %6
+}
+
; We can't fold to ptest if we're not checking every pcmpeq result
define i32 @movmskps_ptest_numelts_mismatch(<16 x i8> %a0) {
; SSE-LABEL: movmskps_ptest_numelts_mismatch:
diff --git a/llvm/test/CodeGen/X86/combine-vpmadd52.ll b/llvm/test/CodeGen/X86/combine-vpmadd52.ll
new file mode 100644
index 0000000..fd295ea
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-vpmadd52.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma | FileCheck %s --check-prefixes=CHECK,AVX
+
+define <2 x i64> @test1_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test1_vpmadd52l:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test1_vpmadd52l:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+
+ %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %x2)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test2_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test2_vpmadd52l:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test2_vpmadd52l:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %and = and <2 x i64> %x2, splat (i64 4503599627370495) ; (1LL << 52) - 1
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %and)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test3_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test3_vpmadd52l:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test3_vpmadd52l:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+ %or = or <2 x i64> %x2, splat (i64 4503599627370496) ; 1LL << 52
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %or)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_wrong_bits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52l_wrong_bits:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
+; AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm2
+; AVX512-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test_vpmadd52l_wrong_bits:
+; AVX: # %bb.0:
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %and = and <2 x i64> %x1, splat (i64 2251799813685247) ; (1LL << 51) - 1
+ %or = or <2 x i64> %x2, splat (i64 2251799813685248) ; 1LL << 51
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %or)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_wrong_op(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52l_wrong_op:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test_vpmadd52l_wrong_op:
+; AVX: # %bb.0:
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %and, <2 x i64> %x1, <2 x i64> %x2)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52h:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test_vpmadd52h:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpmadd52huq %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+
+ %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+ %or = or <2 x i64> %x2, splat (i64 4503599627370496) ; 1LL << 52
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %or)
+ ret <2 x i64> %1
+}
+
+; Test the fold x * 0 + y -> y
+define <2 x i64> @test_vpmadd52l_mul_zero(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 0>, <2 x i64> %x1)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h_mul_zero(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52h_mul_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 0>, <2 x i64> %x1)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_commuted(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> <i64 0, i64 0>)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_both(<2 x i64> %x0) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero_both:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_in_52bits(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero_in_52bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: retq
+
+ ; mul by (1 << 52)
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> splat (i64 4503599627370496), <2 x i64> %x1)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_add_zero(<2 x i64> %x0, <2 x i64> %x1) {
+; AVX512-LABEL: test_vpmadd52l_add_zero:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test_vpmadd52l_add_zero:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-NEXT: retq
+
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> <i64 0, i64 0>, <2 x i64> %x0, <2 x i64> %x1)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_scalar(<2 x i64> %x0, <2 x i64> %x1) {
+; AVX512-LABEL: test_vpmadd52l_mul_zero_scalar:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmadd52luq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT: retq
+;
+; AVX-LABEL: test_vpmadd52l_mul_zero_scalar:
+; AVX: # %bb.0:
+; AVX-NEXT: {vex} vpmadd52luq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX-NEXT: retq
+
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 123>, <2 x i64> %x1)
+ ret <2 x i64> %1
+}
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-samedest.mir b/llvm/test/CodeGen/X86/conditional-tailcall-samedest.mir
index e2f5b67f..8257266 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall-samedest.mir
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-samedest.mir
@@ -94,8 +94,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
diff --git a/llvm/test/CodeGen/X86/cse-two-preds.mir b/llvm/test/CodeGen/X86/cse-two-preds.mir
index c821b3b..94dc1c4 100644
--- a/llvm/test/CodeGen/X86/cse-two-preds.mir
+++ b/llvm/test/CodeGen/X86/cse-two-preds.mir
@@ -85,8 +85,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/domain-reassignment.mir b/llvm/test/CodeGen/X86/domain-reassignment.mir
index 1889982..18c5057 100644
--- a/llvm/test/CodeGen/X86/domain-reassignment.mir
+++ b/llvm/test/CodeGen/X86/domain-reassignment.mir
@@ -109,8 +109,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -255,8 +255,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -375,8 +375,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -487,8 +487,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -590,8 +590,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -686,8 +686,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -756,8 +756,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -829,8 +829,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
@@ -911,8 +911,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
entry_values: []
diff --git a/llvm/test/CodeGen/X86/early-clobber.mir b/llvm/test/CodeGen/X86/early-clobber.mir
new file mode 100644
index 0000000..4293315
--- /dev/null
+++ b/llvm/test/CodeGen/X86/early-clobber.mir
@@ -0,0 +1,76 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=i386-unknown-linux-gnu -start-before=twoaddressinstruction -stop-after=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
+
+# Test a register live range that is split by rematerialization. The live range should
+# start with Slot_EarlyClobber instead of Slot_Register; the MachineVerifier can check this.
+
+--- |
+
+ define void @test() nounwind {
+ ret void
+ }
+
+...
+---
+name: test
+alignment: 16
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+fixedStack:
+ - { id: 0, size: 4, alignment: 16, isImmutable: true }
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $ebp, $ebx, $edi, $esi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: frame-setup PUSH32r killed $ebp, implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: frame-setup PUSH32r killed $ebx, implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: frame-setup PUSH32r killed $edi, implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: frame-setup PUSH32r killed $esi, implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: $esp = frame-setup SUB32ri $esp, 12, implicit-def dead $eflags
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $ebp, 12 /* clobber */, implicit-def dead early-clobber $eax, 12 /* clobber */, implicit-def dead early-clobber $ebx, 12 /* clobber */, implicit-def dead early-clobber $ecx, 12 /* clobber */, implicit-def dead early-clobber $edx, 12 /* clobber */, implicit-def dead early-clobber $esi, 12 /* clobber */, implicit-def dead early-clobber $edi, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ ; CHECK-NEXT: early-clobber renamable $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags
+ ; CHECK-NEXT: TEST8rr renamable $al, renamable $al, implicit-def $eflags
+ ; CHECK-NEXT: JCC_1 %bb.2, 5, implicit $eflags
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $eax = MOV32ri 1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $esp = frame-destroy ADD32ri $esp, 12, implicit-def dead $eflags
+ ; CHECK-NEXT: $esi = frame-destroy POP32r implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: $edi = frame-destroy POP32r implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: $ebx = frame-destroy POP32r implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: $ebp = frame-destroy POP32r implicit-def $esp, implicit $esp
+ ; CHECK-NEXT: RET 0, $eax
+ bb.0:
+ successors: %bb.1, %bb.2
+
+ early-clobber %0:gr32_abcd = MOV32r0 implicit-def dead $eflags
+ INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $ebp, 12 /* clobber */, implicit-def dead early-clobber $eax, 12 /* clobber */, implicit-def dead early-clobber $ebx, 12 /* clobber */, implicit-def dead early-clobber $ecx, 12 /* clobber */, implicit-def dead early-clobber $edx, 12 /* clobber */, implicit-def dead early-clobber $esi, 12 /* clobber */, implicit-def dead early-clobber $edi, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ %4:gr8 = COPY %0.sub_8bit
+ TEST8rr killed %4, %4, implicit-def $eflags
+ JCC_1 %bb.2, 5, implicit killed $eflags
+ JMP_1 %bb.1
+
+ bb.1:
+ %1:gr32 = MOV32ri 1
+ %5:gr32 = COPY killed %1
+ JMP_1 %bb.3
+
+ bb.2:
+ %5:gr32 = COPY killed %0
+
+ bb.3:
+ %3:gr32 = COPY killed %5
+ $eax = COPY killed %3
+ RET 0, killed $eax
+
+...
diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll
index 0ffcb8c..dc35c8f 100644
--- a/llvm/test/CodeGen/X86/fma_patterns.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns.ll
@@ -1567,7 +1567,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
;
-define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_v4f32_fneg_fmadd:
; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
@@ -1582,13 +1582,13 @@ define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x
; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; AVX512-NEXT: retq
- %mul = fmul nsz <4 x float> %a0, %a1
- %add = fadd nsz <4 x float> %mul, %a2
- %neg = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+ %mul = fmul contract nsz <4 x float> %a0, %a1
+ %add = fadd contract nsz <4 x float> %mul, %a2
+ %neg = fsub contract nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
ret <4 x float> %neg
}
-define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; FMA-LABEL: test_v4f64_fneg_fmsub:
; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
@@ -1609,7 +1609,7 @@ define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <
ret <4 x double> %neg
}
-define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_v4f32_fneg_fnmadd:
; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
@@ -1624,14 +1624,14 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4
; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; AVX512-NEXT: retq
- %mul = fmul nsz <4 x float> %a0, %a1
- %neg0 = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
- %add = fadd nsz <4 x float> %neg0, %a2
- %neg1 = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
+ %mul = fmul contract nsz <4 x float> %a0, %a1
+ %neg0 = fsub contract nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
+ %add = fadd contract nsz <4 x float> %neg0, %a2
+ %neg1 = fsub contract nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
ret <4 x float> %neg1
}
-define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; FMA-LABEL: test_v4f64_fneg_fnmsub:
; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
@@ -1646,10 +1646,10 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1,
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT: retq
- %mul = fmul nsz <4 x double> %a0, %a1
- %neg0 = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
- %sub = fsub nsz <4 x double> %neg0, %a2
- %neg1 = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
+ %mul = fmul contract nsz <4 x double> %a0, %a1
+ %neg0 = fsub contract nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
+ %sub = fsub contract nsz <4 x double> %neg0, %a2
+ %neg1 = fsub contract nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
ret <4 x double> %neg1
}
@@ -1657,7 +1657,7 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1,
; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
;
-define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
+define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) {
; FMA-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
; FMA: # %bb.0:
; FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1672,9 +1672,9 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
; AVX512: # %bb.0:
; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
- %m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
- %m1 = fmul <4 x float> %x, <float 4.0, float 3.0, float 2.0, float 1.0>
- %a = fadd <4 x float> %m0, %m1
+ %m0 = fmul contract reassoc <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+ %m1 = fmul contract reassoc <4 x float> %x, <float 4.0, float 3.0, float 2.0, float 1.0>
+ %a = fadd contract reassoc <4 x float> %m0, %m1
ret <4 x float> %a
}
@@ -1682,7 +1682,7 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
;
-define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) #0 {
+define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) {
; FMA-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
@@ -1697,15 +1697,15 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
; AVX512-NEXT: retq
- %m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
- %m1 = fmul <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
- %a = fadd <4 x float> %m1, %y
+ %m0 = fmul contract reassoc <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+ %m1 = fmul contract reassoc <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
+ %a = fadd contract reassoc <4 x float> %m1, %y
ret <4 x float> %a
}
; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
-define double @test_f64_fneg_fmul(double %x, double %y) #0 {
+define double @test_f64_fneg_fmul(double %x, double %y) {
; FMA-LABEL: test_f64_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorpd %xmm2, %xmm2, %xmm2
@@ -1723,12 +1723,12 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 {
; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; AVX512-NEXT: retq
- %m = fmul nsz double %x, %y
- %n = fsub double -0.0, %m
+ %m = fmul contract nsz double %x, %y
+ %n = fsub contract double -0.0, %m
ret double %n
}
-define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
+define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) {
; FMA-LABEL: test_v4f32_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorps %xmm2, %xmm2, %xmm2
@@ -1746,12 +1746,12 @@ define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; AVX512-NEXT: retq
- %m = fmul nsz <4 x float> %x, %y
- %n = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %m
+ %m = fmul contract nsz <4 x float> %x, %y
+ %n = fsub contract <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %m
ret <4 x float> %n
}
-define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
+define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) {
; FMA-LABEL: test_v4f64_fneg_fmul:
; FMA: # %bb.0:
; FMA-NEXT: vxorpd %xmm2, %xmm2, %xmm2
@@ -1769,12 +1769,12 @@ define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; AVX512-NEXT: retq
- %m = fmul nsz <4 x double> %x, %y
- %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
+ %m = fmul contract nsz <4 x double> %x, %y
+ %n = fsub contract <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
ret <4 x double> %n
}
-define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) #0 {
+define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) {
; FMA-LABEL: test_v4f64_fneg_fmul_no_nsz:
; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
@@ -1792,8 +1792,8 @@ define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; AVX512-NEXT: retq
- %m = fmul <4 x double> %x, %y
- %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
+ %m = fmul contract <4 x double> %x, %y
+ %n = fsub contract <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
ret <4 x double> %n
}
@@ -2022,5 +2022,3 @@ define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d,
%a2 = fadd fast float %n0, %a1
ret float %a2
}
-
-attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
index fe5ddca..d910110 100644
--- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll
@@ -1053,9 +1053,9 @@ define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
; AVX512: # %bb.0:
; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
- %m0 = fmul <16 x float> %x, <float 17.0, float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0>
- %m1 = fmul <16 x float> %x, <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>
- %a = fadd <16 x float> %m0, %m1
+ %m0 = fmul contract reassoc <16 x float> %x, <float 17.0, float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0>
+ %m1 = fmul contract reassoc <16 x float> %x, <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>
+ %a = fadd contract reassoc <16 x float> %m0, %m1
ret <16 x float> %a
}
@@ -1080,9 +1080,9 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1
; AVX512-NEXT: retq
- %m0 = fmul <16 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
- %m1 = fmul <16 x float> %m0, <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>
- %a = fadd <16 x float> %m1, %y
+ %m0 = fmul contract reassoc <16 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
+ %m1 = fmul contract reassoc <16 x float> %m0, <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>
+ %a = fadd contract reassoc <16 x float> %m1, %y
ret <16 x float> %a
}
diff --git a/llvm/test/CodeGen/X86/fp-double-rounding.ll b/llvm/test/CodeGen/X86/fp-double-rounding.ll
index 543908a..88b20ff 100644
--- a/llvm/test/CodeGen/X86/fp-double-rounding.ll
+++ b/llvm/test/CodeGen/X86/fp-double-rounding.ll
@@ -1,20 +1,53 @@
-; RUN: llc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
-; RUN: llc < %s -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--"
+; CHECK-LABEL: double_rounding_safe:
+; CHECK: callq __trunctfdf2
+; CHECK-NEXT: cvtsd2ss %xmm0
+define void @double_rounding_safe(ptr %x, ptr %f) {
+entry:
+ %x.fp128 = load fp128, ptr %x, align 16
+ %x.double = fptrunc fp128 %x.fp128 to double
+ %x.float = fptrunc double %x.double to float
+ store float %x.float, ptr %f, align 4
+ ret void
+}
+
+; CHECK-LABEL: double_rounding_contract_fst:
+; CHECK: callq __trunctfdf2
+; CHECK-NEXT: cvtsd2ss %xmm0
+define void @double_rounding_contract_fst(ptr %x, ptr %f) {
+entry:
+ %x.fp128 = load fp128, ptr %x, align 16
+ %x.double = fptrunc contract fp128 %x.fp128 to double
+ %x.float = fptrunc double %x.double to float
+ store float %x.float, ptr %f, align 4
+ ret void
+}
+
+; CHECK-LABEL: double_rounding_contract_snd:
+; CHECK: callq __trunctfdf2
+; CHECK-NEXT: cvtsd2ss %xmm0
+define void @double_rounding_contract_snd(ptr %x, ptr %f) {
+entry:
+ %x.fp128 = load fp128, ptr %x, align 16
+ %x.double = fptrunc fp128 %x.fp128 to double
+ %x.float = fptrunc contract double %x.double to float
+ store float %x.float, ptr %f, align 4
+ ret void
+}
+
; CHECK-LABEL: double_rounding:
-; SAFE: callq __trunctfdf2
-; SAFE-NEXT: cvtsd2ss %xmm0
-; UNSAFE: callq __trunctfsf2
-; UNSAFE-NOT: cvt
+; CHECK: callq __trunctfsf2
+; CHECK-NOT: cvt
define void @double_rounding(ptr %x, ptr %f) {
entry:
- %0 = load fp128, ptr %x, align 16
- %1 = fptrunc fp128 %0 to double
- %2 = fptrunc double %1 to float
- store float %2, ptr %f, align 4
+ %x.fp128 = load fp128, ptr %x, align 16
+ %x.double = fptrunc contract fp128 %x.fp128 to double
+ %x.float = fptrunc contract double %x.double to float
+ store float %x.float, ptr %f, align 4
ret void
}
diff --git a/llvm/test/CodeGen/X86/fp16-reload.mir b/llvm/test/CodeGen/X86/fp16-reload.mir
new file mode 100644
index 0000000..ddbd48c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp16-reload.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown -start-before=twoaddressinstruction -stop-after=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
+
+...
+---
+name: test
+alignment: 16
+tracksRegLiveness: true
+debugInstrRef: true
+registers:
+liveins:
+ - { reg: '$xmm0', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+ hasCalls: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $xmm0
+
+ ; CHECK-LABEL: name: test
+ ; CHECK: liveins: $xmm0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: MOVSSmr $rsp, 1, $noreg, -4, $noreg, $xmm0 :: (store (s32) into %stack.0, align 2)
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ ; CHECK-NEXT: renamable $xmm0 = MOVSSrm $rsp, 1, $noreg, -4, $noreg :: (load (s32) from %stack.0, align 2)
+ ; CHECK-NEXT: FNOP implicit-def $fpsw, implicit killed renamable $xmm0
+ ; CHECK-NEXT: RET 0
+ %0:fr16 = COPY killed $xmm0
+ INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+ FNOP implicit-def $fpsw, implicit %0:fr16
+ RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/fp16-spill.ll b/llvm/test/CodeGen/X86/fp16-spill.ll
new file mode 100644
index 0000000..6161009
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp16-spill.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
+
+define half @test(float %f, ptr %p) nounwind {
+; SSE2-LABEL: test:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: subq $16, %rsp
+; SSE2-NEXT: movq %rdi, %rbx
+; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: callq __extendhfsf2@PLT
+; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT: #APP
+; SSE2-NEXT: #NO_APP
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss %xmm0, (%rbx)
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: addq $16, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $16, %rsp
+; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: callq __truncsfhf2@PLT
+; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: callq __extendhfsf2@PLT
+; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT: #APP
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss %xmm0, (%rbx)
+; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: addq $16, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: #APP
+; AVX512-NEXT: #NO_APP
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vmovss %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: retq
+ %t = fptrunc float %f to half
+ %t2 = fpext half %t to float
+ tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
+ store float %t2, ptr %p
+ ret half %t
+}
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
index 7f6d64c..a251e43 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -57,6 +57,52 @@ define <4 x i1> @test_signed_v4i1_v4f32(<4 x float> %f) nounwind {
ret <4 x i1> %x
}
+define <4 x i1> @test_freeze_signed_v4i1_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_freeze_signed_v4i1_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: ucomiss %xmm1, %xmm1
+; CHECK-NEXT: maxss %xmm2, %xmm1
+; CHECK-NEXT: xorps %xmm3, %xmm3
+; CHECK-NEXT: minss %xmm3, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %ecx
+; CHECK-NEXT: cmovpl %eax, %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: movaps %xmm0, %xmm4
+; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; CHECK-NEXT: ucomiss %xmm4, %xmm4
+; CHECK-NEXT: maxss %xmm2, %xmm4
+; CHECK-NEXT: minss %xmm3, %xmm4
+; CHECK-NEXT: cvttss2si %xmm4, %ecx
+; CHECK-NEXT: cmovpl %eax, %ecx
+; CHECK-NEXT: movd %ecx, %xmm4
+; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: maxss %xmm2, %xmm1
+; CHECK-NEXT: minss %xmm3, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %ecx
+; CHECK-NEXT: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: cmovpl %eax, %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT: ucomiss %xmm0, %xmm0
+; CHECK-NEXT: maxss %xmm2, %xmm0
+; CHECK-NEXT: minss %xmm3, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %ecx
+; CHECK-NEXT: cmovpl %eax, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> %f)
+ %y = freeze <4 x i1> %x
+ ret <4 x i1> %y
+}
+
define <4 x i8> @test_signed_v4i8_v4f32(<4 x float> %f) nounwind {
; CHECK-LABEL: test_signed_v4i8_v4f32:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
index ffbdd66..7b1db5c 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
@@ -48,6 +48,43 @@ define <4 x i1> @test_unsigned_v4i1_v4f32(<4 x float> %f) nounwind {
ret <4 x i1> %x
}
+define <4 x i1> @test_freeze_unsigned_v4i1_v4f32(<4 x float> %f) nounwind {
+; CHECK-LABEL: test_freeze_unsigned_v4i1_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: maxss %xmm2, %xmm1
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: minss %xmm3, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movaps %xmm0, %xmm4
+; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; CHECK-NEXT: maxss %xmm2, %xmm4
+; CHECK-NEXT: minss %xmm3, %xmm4
+; CHECK-NEXT: cvttss2si %xmm4, %eax
+; CHECK-NEXT: movd %eax, %xmm4
+; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: maxss %xmm2, %xmm1
+; CHECK-NEXT: minss %xmm3, %xmm1
+; CHECK-NEXT: cvttss2si %xmm1, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT: maxss %xmm2, %xmm0
+; CHECK-NEXT: minss %xmm3, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> %f)
+ %y = freeze <4 x i1> %x
+ ret <4 x i1> %y
+}
+
define <4 x i8> @test_unsigned_v4i8_v4f32(<4 x float> %f) nounwind {
; CHECK-LABEL: test_unsigned_v4i8_v4f32:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index 962ffe4..e223765 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -924,6 +924,76 @@ define i32 @freeze_usubo(i32 %a0, i32 %a1, i8 %a2, i8 %a3) nounwind {
ret i32 %r
}
+define i32 @freeze_scmp(i32 %a0) nounwind {
+; X86-LABEL: freeze_scmp:
+; X86: # %bb.0:
+; X86-NEXT: movl $2, %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setl %al
+; X86-NEXT: setg %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: setl %al
+; X86-NEXT: setg %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_scmp:
+; X64: # %bb.0:
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: cmpl %edi, %eax
+; X64-NEXT: setl %al
+; X64-NEXT: setg %cl
+; X64-NEXT: subb %al, %cl
+; X64-NEXT: movsbl %cl, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: setl %al
+; X64-NEXT: setg %cl
+; X64-NEXT: subb %al, %cl
+; X64-NEXT: movsbl %cl, %eax
+; X64-NEXT: retq
+ %x = call i32 @llvm.scmp.i32(i32 2, i32 %a0)
+ %y = freeze i32 %x
+ %z = call i32 @llvm.scmp.i32(i32 0, i32 %y)
+ ret i32 %z
+}
+
+define i32 @freeze_ucmp(i32 %a0) nounwind {
+; X86-LABEL: freeze_ucmp:
+; X86: # %bb.0:
+; X86-NEXT: movl $2, %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: seta %al
+; X86-NEXT: sbbb $0, %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: cmpl $2, %eax
+; X86-NEXT: setae %cl
+; X86-NEXT: cmpl $1, %eax
+; X86-NEXT: sbbb $0, %cl
+; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_ucmp:
+; X64: # %bb.0:
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: cmpl %edi, %eax
+; X64-NEXT: seta %al
+; X64-NEXT: sbbb $0, %al
+; X64-NEXT: movsbl %al, %eax
+; X64-NEXT: cmpl $2, %eax
+; X64-NEXT: setae %cl
+; X64-NEXT: cmpl $1, %eax
+; X64-NEXT: sbbb $0, %cl
+; X64-NEXT: movsbl %cl, %eax
+; X64-NEXT: retq
+ %x = call i32 @llvm.ucmp.i32(i32 2, i32 %a0)
+ %y = freeze i32 %x
+ %z = call i32 @llvm.ucmp.i32(i32 %y, i32 1)
+ ret i32 %z
+}
+
define void @pr59676_frozen(ptr %dst, i32 %x.orig) {
; X86-LABEL: pr59676_frozen:
; X86: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 3196f81..38e3e23 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -141,3 +141,48 @@ entry:
%z = urem i32 %y, 10
ret i32 %z
}
+
+; Make sure we don't crash when replacing all uses of a node N with an existing freeze of N.
+
+define i64 @pr155345(ptr %p1, i1 %cond, ptr %p2, ptr %p3) {
+; X86ASM-LABEL: pr155345:
+; X86ASM: # %bb.0: # %entry
+; X86ASM-NEXT: movzbl (%rdi), %edi
+; X86ASM-NEXT: xorl %eax, %eax
+; X86ASM-NEXT: orb $1, %dil
+; X86ASM-NEXT: movb %dil, (%rdx)
+; X86ASM-NEXT: movzbl %dil, %edx
+; X86ASM-NEXT: cmovel %edx, %eax
+; X86ASM-NEXT: sete %dil
+; X86ASM-NEXT: testb $1, %sil
+; X86ASM-NEXT: cmovnel %edx, %eax
+; X86ASM-NEXT: movb %dl, (%rcx)
+; X86ASM-NEXT: movl $1, %edx
+; X86ASM-NEXT: movl %eax, %ecx
+; X86ASM-NEXT: shlq %cl, %rdx
+; X86ASM-NEXT: orb %sil, %dil
+; X86ASM-NEXT: movzbl %dil, %eax
+; X86ASM-NEXT: andl %edx, %eax
+; X86ASM-NEXT: andl $1, %eax
+; X86ASM-NEXT: retq
+entry:
+ %load1 = load i8, ptr %p1, align 1
+ %v1 = or i8 %load1, 1
+ %v2 = zext i8 %v1 to i32
+ store i8 %v1, ptr %p2, align 1
+ %v3 = load i8, ptr %p2, align 1
+ %ext1 = sext i8 %v3 to i64
+ %ext2 = zext i32 %v2 to i64
+ %cmp1 = icmp ult i64 0, %ext1
+ %v4 = select i1 %cond, i1 false, i1 %cmp1
+ %sel1 = select i1 %v4, i64 0, i64 %ext2
+ %shl = shl i64 1, %sel1
+ store i8 %v1, ptr %p3, align 1
+ %v5 = load i8, ptr %p3, align 1
+ %ext3 = sext i8 %v5 to i64
+ %cmp2 = icmp ult i64 0, %ext3
+ %v6 = select i1 %cond, i1 false, i1 %cmp2
+ %sel2 = select i1 %v6, i64 0, i64 1
+ %and = and i64 %sel2, %shl
+ ret i64 %and
+}
diff --git a/llvm/test/CodeGen/X86/fshl-fshr-constant.ll b/llvm/test/CodeGen/X86/fshl-fshr-constant.ll
new file mode 100644
index 0000000..fdc34f5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fshl-fshr-constant.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,CHECK-EXPAND
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi2 | FileCheck %s --check-prefixes=CHECK,CHECK-UNEXPAND
+
+define <4 x i32> @test_fshl_constants() {
+; CHECK-EXPAND-LABEL: test_fshl_constants:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vmovaps {{.*#+}} xmm0 = [0,512,2048,6144]
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshl_constants:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpmovsxwd {{.*#+}} xmm0 = [0,512,2048,6144]
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshl_splat_constants() {
+; CHECK-LABEL: test_fshl_splat_constants:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [256,256,256,256]
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshl_two_constants(<4 x i32> %a) {
+; CHECK-EXPAND-LABEL: test_fshl_two_constants:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshl_two_constants:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
+; CHECK-UNEXPAND-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshl_one_constant(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-EXPAND-LABEL: test_fshl_one_constant:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshl_one_constant:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshl_none_constant(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-EXPAND-LABEL: test_fshl_none_constant:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; CHECK-EXPAND-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; CHECK-EXPAND-NEXT: vpsrld $1, %xmm1, %xmm1
+; CHECK-EXPAND-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; CHECK-EXPAND-NEXT: vpand %xmm3, %xmm2, %xmm2
+; CHECK-EXPAND-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshl_none_constant:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshr_constants() {
+; CHECK-LABEL: test_fshr_constants:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,8388608,8388608,6291456]
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshr_two_constants(<4 x i32> %a) {
+; CHECK-EXPAND-LABEL: test_fshr_two_constants:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshr_two_constants:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
+; CHECK-UNEXPAND-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-UNEXPAND-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshr_one_constant(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-EXPAND-LABEL: test_fshr_one_constant:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-EXPAND-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshr_one_constant:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; CHECK-UNEXPAND-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 8, i32 9, i32 10, i32 11>)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshr_none_constant(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-EXPAND-LABEL: test_fshr_none_constant:
+; CHECK-EXPAND: # %bb.0:
+; CHECK-EXPAND-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
+; CHECK-EXPAND-NEXT: vpand %xmm3, %xmm2, %xmm4
+; CHECK-EXPAND-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
+; CHECK-EXPAND-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; CHECK-EXPAND-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-EXPAND-NEXT: retq
+;
+; CHECK-UNEXPAND-LABEL: test_fshr_none_constant:
+; CHECK-UNEXPAND: # %bb.0:
+; CHECK-UNEXPAND-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
+; CHECK-UNEXPAND-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-UNEXPAND-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_fshr_splat_constants() {
+; CHECK-LABEL: test_fshr_splat_constants:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [16777216,16777216,16777216,16777216]
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+ ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/function-align.ll b/llvm/test/CodeGen/X86/function-align.ll
deleted file mode 100644
index 11d0e99..0000000
--- a/llvm/test/CodeGen/X86/function-align.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -function-sections < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; CHECK: .section .text.f1
-; CHECK-NOT: .p2align
-; CHECK: f1:
-define void @f1() align 1 {
- ret void
-}
-
-; CHECK: .section .text.f2
-; CHECK-NEXT: .globl f2
-; CHECK-NEXT: .p2align 1
-define void @f2() align 2 {
- ret void
-}
diff --git a/llvm/test/CodeGen/X86/huge-stack.ll b/llvm/test/CodeGen/X86/huge-stack.ll
index 920033b..41b8a01 100644
--- a/llvm/test/CodeGen/X86/huge-stack.ll
+++ b/llvm/test/CodeGen/X86/huge-stack.ll
@@ -5,20 +5,70 @@
define void @foo() unnamed_addr #0 {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
+; CHECK-NEXT: movabsq $8589934472, %rax # imm = 0x1FFFFFF88
; CHECK-NEXT: subq %rax, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 8589934470
-; CHECK-NEXT: movb $42, -129(%rsp)
-; CHECK-NEXT: movb $43, -128(%rsp)
-; CHECK-NEXT: movabsq $8589934462, %rax # imm = 0x1FFFFFF7E
+; CHECK-NEXT: .cfi_def_cfa_offset 8589934480
+; CHECK-NEXT: movabsq $4294967177, %rax # imm = 0xFFFFFF89
+; CHECK-NEXT: movb $42, (%rsp,%rax)
+; CHECK-NEXT: movb $43, -118(%rsp)
+; CHECK-NEXT: movabsq $8589934472, %rax # imm = 0x1FFFFFF88
; CHECK-NEXT: addq %rax, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
- %1 = alloca %large, align 1
- %2 = alloca %large, align 1
- %3 = getelementptr inbounds %large, ptr %1, i64 0, i64 0
- store i8 42, ptr %3, align 1
- %4 = getelementptr inbounds %large, ptr %2, i64 0, i64 0
- store i8 43, ptr %4, align 1
+ %large1 = alloca %large, align 1
+ %large2 = alloca %large, align 1
+ %ptrLarge1 = getelementptr inbounds %large, ptr %large1, i64 0, i64 0
+ store i8 42, ptr %ptrLarge1, align 1
+ %ptrLarge2 = getelementptr inbounds %large, ptr %large2, i64 0, i64 0
+ store i8 43, ptr %ptrLarge2, align 1
ret void
}
+
+declare ptr @baz(ptr, ptr, ptr, ptr)
+
+define ptr @scavenge_spill() unnamed_addr #0 {
+; CHECK-LABEL: scavenge_spill:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $25769803816, %rax # imm = 0x600000028
+; CHECK-NEXT: subq %rax, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 25769803824
+; CHECK-NEXT: movabsq $21474836521, %rax # imm = 0x500000029
+; CHECK-NEXT: leaq (%rsp,%rax), %rdi
+; CHECK-NEXT: movabsq $17179869226, %rax # imm = 0x40000002A
+; CHECK-NEXT: leaq (%rsp,%rax), %rsi
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movabsq $12884901931, %rax # imm = 0x30000002B
+; CHECK-NEXT: leaq (%rsp,%rax), %rdx
+; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movabsq $8589934636, %rax # imm = 0x20000002C
+; CHECK-NEXT: leaq (%rsp,%rax), %rcx
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: callq baz@PLT
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: leaq 46(%rsp), %rdi
+; CHECK-NEXT: callq baz@PLT
+; CHECK-NEXT: # kill: def $rcx killed $rax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: movabsq $25769803816, %rcx # imm = 0x600000028
+; CHECK-NEXT: addq %rcx, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %large1 = alloca %large, align 1
+ %ptrLarge1 = getelementptr inbounds %large, ptr %large1, i64 0, i64 0
+ %large2 = alloca %large, align 1
+ %ptrLarge2 = getelementptr inbounds %large, ptr %large2, i64 0, i64 0
+ %large3 = alloca %large, align 1
+ %ptrLarge3 = getelementptr inbounds %large, ptr %large3, i64 0, i64 0
+ %large4 = alloca %large, align 1
+ %ptrLarge4 = getelementptr inbounds %large, ptr %large4, i64 0, i64 0
+ %large5 = alloca %large, align 1
+ %ptrLarge5 = getelementptr inbounds %large, ptr %large5, i64 0, i64 0
+ %ret1 = call ptr @baz(ptr %ptrLarge1, ptr %ptrLarge2, ptr %ptrLarge3, ptr %ptrLarge4)
+ %large6 = alloca %large, align 1
+ %ptrLarge6 = getelementptr inbounds %large, ptr %large6, i64 0, i64 0
+ %ret2 = call ptr @baz(ptr %ptrLarge6, ptr %ptrLarge2, ptr %ptrLarge3, ptr %ptrLarge4)
+ ret ptr %ret1
+}
diff --git a/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll b/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll
index 3ac0fd7..eccb323 100644
--- a/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll
+++ b/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll
@@ -22,41 +22,45 @@ define void @FontChange(i1 %foo) nounwind {
; CHECK-LABEL: FontChange:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: je .LBB0_10
+; CHECK-NEXT: je .LBB0_12
+; CHECK-NEXT: # %bb.1: # %bb298
+; CHECK-NEXT: je .LBB0_3
+; CHECK-NEXT: # %bb.2: # %bb304
+; CHECK-NEXT: je .LBB0_4
; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: .LBB0_1: # %bb366
+; CHECK-NEXT: .LBB0_3: # %bb366
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # %bb.2: # %bb428
+; CHECK-NEXT: jne .LBB0_3
+; CHECK-NEXT: .LBB0_4: # %bb428
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: je .LBB0_10
-; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: je .LBB0_12
+; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: cmpb $0, 0
; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: .LBB0_4: # %bb650
+; CHECK-NEXT: .LBB0_6: # %bb650
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: je .LBB0_4
-; CHECK-NEXT: # %bb.5: # %bb662
+; CHECK-NEXT: je .LBB0_6
+; CHECK-NEXT: # %bb.7: # %bb662
; CHECK-NEXT: movl 0, %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: andl $57344, %ecx # imm = 0xE000
; CHECK-NEXT: cmpl $8192, %ecx # imm = 0x2000
-; CHECK-NEXT: jne .LBB0_10
-; CHECK-NEXT: # %bb.6: # %bb4884
+; CHECK-NEXT: jne .LBB0_12
+; CHECK-NEXT: # %bb.8: # %bb4884
; CHECK-NEXT: andl $7168, %eax # imm = 0x1C00
; CHECK-NEXT: cmpl $1024, %eax # imm = 0x400
-; CHECK-NEXT: jne .LBB0_10
-; CHECK-NEXT: # %bb.7: # %bb4932
+; CHECK-NEXT: jne .LBB0_12
+; CHECK-NEXT: # %bb.9: # %bb4932
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB0_10
-; CHECK-NEXT: # %bb.8: # %bb4940
+; CHECK-NEXT: jne .LBB0_12
+; CHECK-NEXT: # %bb.10: # %bb4940
; CHECK-NEXT: movl 0, %eax
; CHECK-NEXT: cmpl $160, %eax
-; CHECK-NEXT: je .LBB0_10
-; CHECK-NEXT: # %bb.9: # %bb4940
+; CHECK-NEXT: je .LBB0_12
+; CHECK-NEXT: # %bb.11: # %bb4940
; CHECK-NEXT: cmpl $159, %eax
-; CHECK-NEXT: .LBB0_10: # %bb4897
+; CHECK-NEXT: .LBB0_12: # %bb4897
; CHECK-NEXT: retq
entry:
br i1 %foo, label %bb298, label %bb49
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index cab810d..8b1e69a 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -143,6 +143,57 @@ define <8 x i1> @invert_i8_mask_extract_8(i8 %mask) {
ret <8 x i1> %cmp.45
}
+define <8 x i1> @i8_mask_extract_7(i8 %mask) {
+; X64-AVX512-LABEL: i8_mask_extract_7:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: shrb %dil
+; X64-AVX512-NEXT: movzbl %dil, %eax
+; X64-AVX512-NEXT: kmovd %eax, %k0
+; X64-AVX512-NEXT: vpmovm2w %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i8_mask_extract_7:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,4,8,16,32,64,128,0,2,4,8,16,32,64,128,0]
+; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <8 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <8 x i8> %.splatinsert, <8 x i8> poison, <8 x i32> zeroinitializer
+ %1 = and <8 x i8> %.splat, <i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 poison>
+ %cmp.45 = icmp ne <8 x i8> %1, zeroinitializer
+ ret <8 x i1> %cmp.45
+}
+
+define <8 x i1> @invert_i8_mask_extract_7(i8 %mask) {
+; X64-AVX512-LABEL: invert_i8_mask_extract_7:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: shrb %dil
+; X64-AVX512-NEXT: movzbl %dil, %eax
+; X64-AVX512-NEXT: kmovd %eax, %k0
+; X64-AVX512-NEXT: knotb %k0, %k0
+; X64-AVX512-NEXT: vpmovm2w %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i8_mask_extract_7:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <8 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <8 x i8> %.splatinsert, <8 x i8> poison, <8 x i32> zeroinitializer
+ %1 = and <8 x i8> %.splat, <i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 poison>
+ %cmp.45 = icmp eq <8 x i8> %1, zeroinitializer
+ ret <8 x i1> %cmp.45
+}
+
define <4 x i1> @i16_mask_extract_4(i16 %mask) {
; X64-AVX512-LABEL: i16_mask_extract_4:
; X64-AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/large-displacements-fastisel.ll b/llvm/test/CodeGen/X86/large-displacements-fastisel.ll
new file mode 100644
index 0000000..362b1b5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/large-displacements-fastisel.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64 -O=0 -verify-machineinstrs | FileCheck %s
+@G = global i8 0
+
+; Regression test for PR113856 - incorrect assertion in FastISel.
+
+define i32 @main() {
+; CHECK-LABEL: main:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $-2147483652, %rax # imm = 0xFFFFFFFF7FFFFFFC
+; CHECK-NEXT: movl $0, (%rsp,%rax)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+ %1 = alloca i32, align 4
+ %G = getelementptr i8, ptr %1, i32 -2147483648
+ store i32 0, ptr %G, align 4
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/large-displacements.ll b/llvm/test/CodeGen/X86/large-displacements.ll
new file mode 100644
index 0000000..d7085a5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/large-displacements.ll
@@ -0,0 +1,82 @@
+; RUN: not llc < %s -mtriple=i686 -filetype=null -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=ERR-i686
+; RUN: llc < %s -mtriple=x86_64 -verify-machineinstrs | FileCheck %s -check-prefix=x86_64
+
+; Regression test for #121932, #113856, #106352, #69365 and #25051, which are caused by
+; an incorrectly written assertion about 64-bit offsets when compiling for 32-bit X86.
+
+define i32 @main() #0 {
+; ERR-i686: error: <unknown>:0:0: 64-bit offset calculated but target is 32-bit
+;
+; x86_64-LABEL: main:
+; x86_64: # %bb.0: # %entry
+; x86_64-NEXT: movl $4294967192, %eax # imm = 0xFFFFFF98
+; x86_64-NEXT: subq %rax, %rsp
+; x86_64-NEXT: .cfi_def_cfa_offset 4294967200
+; x86_64-NEXT: movabsq $3221225318, %rax # imm = 0xBFFFFF66
+; x86_64-NEXT: movb $32, (%rsp,%rax)
+; x86_64-NEXT: movb $33, 2147483494(%rsp)
+; x86_64-NEXT: movb $34, 1073741670(%rsp)
+; x86_64-NEXT: movb $35, -154(%rsp)
+; x86_64-NEXT: xorl %eax, %eax
+; x86_64-NEXT: movl $4294967192, %ecx # imm = 0xFFFFFF98
+; x86_64-NEXT: addq %rcx, %rsp
+; x86_64-NEXT: .cfi_def_cfa_offset 8
+; x86_64-NEXT: retq
+entry:
+ %a = alloca [1073741824 x i8], align 16
+ %b = alloca [1073741824 x i8], align 16
+ %c = alloca [1073741824 x i8], align 16
+ %d = alloca [1073741824 x i8], align 16
+
+ %arrayida = getelementptr inbounds [1073741824 x i8], ptr %a, i64 0, i64 -42
+ %arrayidb = getelementptr inbounds [1073741824 x i8], ptr %b, i64 0, i64 -42
+ %arrayidc = getelementptr inbounds [1073741824 x i8], ptr %c, i64 0, i64 -42
+ %arrayidd = getelementptr inbounds [1073741824 x i8], ptr %d, i64 0, i64 -42
+
+ store i8 32, ptr %arrayida, align 2
+ store i8 33, ptr %arrayidb, align 2
+ store i8 34, ptr %arrayidc, align 2
+ store i8 35, ptr %arrayidd, align 2
+
+ ret i32 0
+}
+
+; Same test as above but for an anonymous function.
+define i32 @0() #0 {
+; ERR-i686: error: <unknown>:0:0: 64-bit offset calculated but target is 32-bit
+;
+; x86_64-LABEL: __unnamed_1:
+; x86_64: # %bb.0: # %entry
+; x86_64-NEXT: movl $4294967192, %eax # imm = 0xFFFFFF98
+; x86_64-NEXT: subq %rax, %rsp
+; x86_64-NEXT: .cfi_def_cfa_offset 4294967200
+; x86_64-NEXT: movabsq $3221225318, %rax # imm = 0xBFFFFF66
+; x86_64-NEXT: movb $32, (%rsp,%rax)
+; x86_64-NEXT: movb $33, 2147483494(%rsp)
+; x86_64-NEXT: movb $34, 1073741670(%rsp)
+; x86_64-NEXT: movb $35, -154(%rsp)
+; x86_64-NEXT: xorl %eax, %eax
+; x86_64-NEXT: movl $4294967192, %ecx # imm = 0xFFFFFF98
+; x86_64-NEXT: addq %rcx, %rsp
+; x86_64-NEXT: .cfi_def_cfa_offset 8
+; x86_64-NEXT: retq
+entry:
+ %a = alloca [1073741824 x i8], align 16
+ %b = alloca [1073741824 x i8], align 16
+ %c = alloca [1073741824 x i8], align 16
+ %d = alloca [1073741824 x i8], align 16
+
+ %arrayida = getelementptr inbounds [1073741824 x i8], ptr %a, i64 0, i64 -42
+ %arrayidb = getelementptr inbounds [1073741824 x i8], ptr %b, i64 0, i64 -42
+ %arrayidc = getelementptr inbounds [1073741824 x i8], ptr %c, i64 0, i64 -42
+ %arrayidd = getelementptr inbounds [1073741824 x i8], ptr %d, i64 0, i64 -42
+
+ store i8 32, ptr %arrayida, align 2
+ store i8 33, ptr %arrayidb, align 2
+ store i8 34, ptr %arrayidc, align 2
+ store i8 35, ptr %arrayidd, align 2
+
+ ret i32 0
+}
+
+attributes #0 = { optnone noinline }
diff --git a/llvm/test/CodeGen/X86/llrint-conv.ll b/llvm/test/CodeGen/X86/llrint-conv.ll
index 402daf8..7bcf573 100644
--- a/llvm/test/CodeGen/X86/llrint-conv.ll
+++ b/llvm/test/CodeGen/X86/llrint-conv.ll
@@ -7,14 +7,50 @@
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX
-define i64 @testmsxs(float %x) {
+define i64 @testmsxh(half %x) nounwind {
+; X86-NOSSE-LABEL: testmsxh:
+; X86-NOSSE: # %bb.0: # %entry
+; X86-NOSSE-NEXT: pushl %eax
+; X86-NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT: movl %eax, (%esp)
+; X86-NOSSE-NEXT: calll __extendhfsf2
+; X86-NOSSE-NEXT: fstps (%esp)
+; X86-NOSSE-NEXT: calll llrintf
+; X86-NOSSE-NEXT: popl %ecx
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: testmsxh:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: pushl %eax
+; X86-SSE2-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-SSE2-NEXT: pextrw $0, %xmm0, %eax
+; X86-SSE2-NEXT: movw %ax, (%esp)
+; X86-SSE2-NEXT: calll __extendhfsf2
+; X86-SSE2-NEXT: fstps (%esp)
+; X86-SSE2-NEXT: calll llrintf
+; X86-SSE2-NEXT: popl %ecx
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE-LABEL: testmsxh:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: pushq %rax
+; X64-SSE-NEXT: callq __extendhfsf2@PLT
+; X64-SSE-NEXT: callq rintf@PLT
+; X64-SSE-NEXT: callq __truncsfhf2@PLT
+; X64-SSE-NEXT: callq __extendhfsf2@PLT
+; X64-SSE-NEXT: cvttss2si %xmm0, %rax
+; X64-SSE-NEXT: popq %rcx
+; X64-SSE-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.llrint.i64.f16(half %x)
+ ret i64 %0
+}
+
+define i64 @testmsxs(float %x) nounwind {
; X86-NOSSE-LABEL: testmsxs:
; X86-NOSSE: # %bb.0: # %entry
; X86-NOSSE-NEXT: pushl %ebp
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
-; X86-NOSSE-NEXT: .cfi_offset %ebp, -8
; X86-NOSSE-NEXT: movl %esp, %ebp
-; X86-NOSSE-NEXT: .cfi_def_cfa_register %ebp
; X86-NOSSE-NEXT: andl $-8, %esp
; X86-NOSSE-NEXT: subl $8, %esp
; X86-NOSSE-NEXT: flds 8(%ebp)
@@ -23,16 +59,12 @@ define i64 @testmsxs(float %x) {
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOSSE-NEXT: movl %ebp, %esp
; X86-NOSSE-NEXT: popl %ebp
-; X86-NOSSE-NEXT: .cfi_def_cfa %esp, 4
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: testmsxs:
; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE2-NEXT: .cfi_offset %ebp, -8
; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
; X86-SSE2-NEXT: andl $-8, %esp
; X86-SSE2-NEXT: subl $8, %esp
; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -43,16 +75,12 @@ define i64 @testmsxs(float %x) {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: testmsxs:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %ebp
-; X86-AVX-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX-NEXT: .cfi_offset %ebp, -8
; X86-AVX-NEXT: movl %esp, %ebp
-; X86-AVX-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX-NEXT: andl $-8, %esp
; X86-AVX-NEXT: subl $8, %esp
; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -63,7 +91,6 @@ define i64 @testmsxs(float %x) {
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl %ebp, %esp
; X86-AVX-NEXT: popl %ebp
-; X86-AVX-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: testmsxs:
@@ -76,18 +103,15 @@ define i64 @testmsxs(float %x) {
; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
; X64-AVX-NEXT: retq
entry:
- %0 = tail call i64 @llvm.llrint.f32(float %x)
+ %0 = tail call i64 @llvm.llrint.i64.f32(float %x)
ret i64 %0
}
-define i64 @testmsxd(double %x) {
+define i64 @testmsxd(double %x) nounwind {
; X86-NOSSE-LABEL: testmsxd:
; X86-NOSSE: # %bb.0: # %entry
; X86-NOSSE-NEXT: pushl %ebp
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
-; X86-NOSSE-NEXT: .cfi_offset %ebp, -8
; X86-NOSSE-NEXT: movl %esp, %ebp
-; X86-NOSSE-NEXT: .cfi_def_cfa_register %ebp
; X86-NOSSE-NEXT: andl $-8, %esp
; X86-NOSSE-NEXT: subl $8, %esp
; X86-NOSSE-NEXT: fldl 8(%ebp)
@@ -96,16 +120,12 @@ define i64 @testmsxd(double %x) {
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOSSE-NEXT: movl %ebp, %esp
; X86-NOSSE-NEXT: popl %ebp
-; X86-NOSSE-NEXT: .cfi_def_cfa %esp, 4
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: testmsxd:
; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE2-NEXT: .cfi_offset %ebp, -8
; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
; X86-SSE2-NEXT: andl $-8, %esp
; X86-SSE2-NEXT: subl $8, %esp
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
@@ -116,16 +136,12 @@ define i64 @testmsxd(double %x) {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: testmsxd:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: pushl %ebp
-; X86-AVX-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX-NEXT: .cfi_offset %ebp, -8
; X86-AVX-NEXT: movl %esp, %ebp
-; X86-AVX-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX-NEXT: andl $-8, %esp
; X86-AVX-NEXT: subl $8, %esp
; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -136,7 +152,6 @@ define i64 @testmsxd(double %x) {
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl %ebp, %esp
; X86-AVX-NEXT: popl %ebp
-; X86-AVX-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: testmsxd:
@@ -149,18 +164,15 @@ define i64 @testmsxd(double %x) {
; X64-AVX-NEXT: vcvtsd2si %xmm0, %rax
; X64-AVX-NEXT: retq
entry:
- %0 = tail call i64 @llvm.llrint.f64(double %x)
+ %0 = tail call i64 @llvm.llrint.i64.f64(double %x)
ret i64 %0
}
-define i64 @testmsll(x86_fp80 %x) {
+define i64 @testmsll(x86_fp80 %x) nounwind {
; X86-LABEL: testmsll:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: fldt 8(%ebp)
@@ -169,7 +181,6 @@ define i64 @testmsll(x86_fp80 %x) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
-; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: testmsll:
@@ -179,10 +190,65 @@ define i64 @testmsll(x86_fp80 %x) {
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: retq
entry:
- %0 = tail call i64 @llvm.llrint.f80(x86_fp80 %x)
+ %0 = tail call i64 @llvm.llrint.i64.f80(x86_fp80 %x)
+ ret i64 %0
+}
+
+; FIXME(#44744): incorrect libcall
+define i64 @testmslq(fp128 %x) nounwind {
+; X86-NOSSE-LABEL: testmslq:
+; X86-NOSSE: # %bb.0: # %entry
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-16, %esp
+; X86-NOSSE-NEXT: subl $16, %esp
+; X86-NOSSE-NEXT: pushl 20(%ebp)
+; X86-NOSSE-NEXT: pushl 16(%ebp)
+; X86-NOSSE-NEXT: pushl 12(%ebp)
+; X86-NOSSE-NEXT: pushl 8(%ebp)
+; X86-NOSSE-NEXT: calll llrintl
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: testmslq:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: pushl 20(%ebp)
+; X86-SSE2-NEXT: pushl 16(%ebp)
+; X86-SSE2-NEXT: pushl 12(%ebp)
+; X86-SSE2-NEXT: pushl 8(%ebp)
+; X86-SSE2-NEXT: calll llrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: testmslq:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-16, %esp
+; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll llrintl
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-LABEL: testmslq:
+; X64: # %bb.0: # %entry
+; X64-NEXT: jmp llrintl@PLT # TAILCALL
+entry:
+ %0 = tail call i64 @llvm.llrint.i64.fp128(fp128 %x)
ret i64 %0
}
-declare i64 @llvm.llrint.f32(float) nounwind readnone
-declare i64 @llvm.llrint.f64(double) nounwind readnone
-declare i64 @llvm.llrint.f80(x86_fp80) nounwind readnone
+declare i64 @llvm.llrint.i64.f32(float) nounwind readnone
+declare i64 @llvm.llrint.i64.f64(double) nounwind readnone
+declare i64 @llvm.llrint.i64.f80(x86_fp80) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i32.ll b/llvm/test/CodeGen/X86/lrint-conv-i32.ll
index 21580f5..3c50aea1 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i32.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i32.ll
@@ -7,16 +7,21 @@
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX
-define i32 @testmsws(float %x) {
+; FIXME: crash
+; define i32 @testmswh(half %x) nounwind {
+; entry:
+; %0 = tail call i32 @llvm.lrint.i32.f16(half %x)
+; ret i32 %0
+; }
+
+define i32 @testmsws(float %x) nounwind {
; X86-NOSSE-LABEL: testmsws:
; X86-NOSSE: # %bb.0: # %entry
; X86-NOSSE-NEXT: pushl %eax
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
; X86-NOSSE-NEXT: flds {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: fistpl (%esp)
; X86-NOSSE-NEXT: movl (%esp), %eax
; X86-NOSSE-NEXT: popl %ecx
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: testmsws:
@@ -43,16 +48,14 @@ entry:
ret i32 %0
}
-define i32 @testmswd(double %x) {
+define i32 @testmswd(double %x) nounwind {
; X86-NOSSE-LABEL: testmswd:
; X86-NOSSE: # %bb.0: # %entry
; X86-NOSSE-NEXT: pushl %eax
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8
; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: fistpl (%esp)
; X86-NOSSE-NEXT: movl (%esp), %eax
; X86-NOSSE-NEXT: popl %ecx
-; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: testmswd:
@@ -79,16 +82,14 @@ entry:
ret i32 %0
}
-define i32 @testmsll(x86_fp80 %x) {
+define i32 @testmsll(x86_fp80 %x) nounwind {
; X86-LABEL: testmsll:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
-; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: fldt {{[0-9]+}}(%esp)
; X86-NEXT: fistpl (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: popl %ecx
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: testmsll:
@@ -102,6 +103,61 @@ entry:
ret i32 %0
}
+; FIXME(#44744): incorrect libcall
+define i32 @testmswq(fp128 %x) nounwind {
+; X86-NOSSE-LABEL: testmswq:
+; X86-NOSSE: # %bb.0: # %entry
+; X86-NOSSE-NEXT: pushl %ebp
+; X86-NOSSE-NEXT: movl %esp, %ebp
+; X86-NOSSE-NEXT: andl $-16, %esp
+; X86-NOSSE-NEXT: subl $16, %esp
+; X86-NOSSE-NEXT: pushl 20(%ebp)
+; X86-NOSSE-NEXT: pushl 16(%ebp)
+; X86-NOSSE-NEXT: pushl 12(%ebp)
+; X86-NOSSE-NEXT: pushl 8(%ebp)
+; X86-NOSSE-NEXT: calll lrintl
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: movl %ebp, %esp
+; X86-NOSSE-NEXT: popl %ebp
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: testmswq:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: pushl 20(%ebp)
+; X86-SSE2-NEXT: pushl 16(%ebp)
+; X86-SSE2-NEXT: pushl 12(%ebp)
+; X86-SSE2-NEXT: pushl 8(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: testmswq:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-16, %esp
+; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-LABEL: testmswq:
+; X64: # %bb.0: # %entry
+; X64-NEXT: jmp lrintl@PLT # TAILCALL
+entry:
+ %0 = tail call i32 @llvm.lrint.i32.f128(fp128 %x)
+ ret i32 %0
+}
+
declare i32 @llvm.lrint.i32.f32(float) nounwind readnone
declare i32 @llvm.lrint.i32.f64(double) nounwind readnone
declare i32 @llvm.lrint.i32.f80(x86_fp80) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i64.ll b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
index 38fa090..2ba1500 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
@@ -3,7 +3,23 @@
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX
-define i64 @testmsxs(float %x) {
+define i64 @testmsxh(half %x) nounwind {
+; SSE-LABEL: testmsxh:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: callq rintf@PLT
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: popq %rcx
+; SSE-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.lrint.i64.f16(half %x)
+ ret i64 %0
+}
+
+define i64 @testmsxs(float %x) nounwind {
; SSE-LABEL: testmsxs:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtss2si %xmm0, %rax
@@ -18,7 +34,7 @@ entry:
ret i64 %0
}
-define i64 @testmsxd(double %x) {
+define i64 @testmsxd(double %x) nounwind {
; SSE-LABEL: testmsxd:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtsd2si %xmm0, %rax
@@ -33,7 +49,7 @@ entry:
ret i64 %0
}
-define i64 @testmsll(x86_fp80 %x) {
+define i64 @testmsll(x86_fp80 %x) nounwind {
; CHECK-LABEL: testmsll:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
@@ -45,7 +61,17 @@ entry:
ret i64 %0
}
-define i32 @PR125324(float %x) {
+; FIXME(#44744): incorrect libcall
+define i64 @testmsxq(fp128 %x) nounwind {
+; CHECK-LABEL: testmsxq:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jmp lrintl@PLT # TAILCALL
+entry:
+ %0 = tail call i64 @llvm.lrint.i64.f128(fp128 %x)
+ ret i64 %0
+}
+
+define i32 @PR125324(float %x) nounwind {
; SSE-LABEL: PR125324:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtss2si %xmm0, %rax
diff --git a/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll b/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll
index b26345e..6920e74 100644
--- a/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll
+++ b/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll
@@ -22,8 +22,8 @@ entry:
call void @bar(i64 0, i64 0, i64 0, i64 0, i64 0, ptr null, ptr %rhs, ptr null, ptr %rhs)
; CHECK: call{{.*}}bar
; CHECK: addq{{.*}}$2147483647, %rsp
-; CHECK: addq{{.*}}$372037585, %rsp
-; CHECK: .cfi_adjust_cfa_offset -2519521232
+; CHECK: addq{{.*}}$372037601, %rsp
+; CHECK: .cfi_adjust_cfa_offset -2519521248
ret void
}
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index fb2433d..7c9adaf 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -730,36 +730,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
-; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -767,20 +767,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX2-LABEL: vec256_i64_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsubq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4
; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -790,36 +790,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4
+; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
+; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0
-; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5
+; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
-; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
+; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
@@ -900,36 +900,36 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT: vpmuludq %xmm2, %xmm9, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
-; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
+; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -937,20 +937,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX2-LABEL: vec256_i64_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -960,36 +960,36 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT: vpcomgtq %xmm2, %xmm0, %xmm4
+; XOP-NEXT: vpcomgtq %xmm3, %xmm1, %xmm4
+; XOP-NEXT: vpcomgtq %xmm2, %xmm0, %xmm5
; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm2
-; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; XOP-NEXT: vpcomgtq %xmm3, %xmm1, %xmm5
+; XOP-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm5, %xmm2
; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm3
-; XOP-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; XOP-NEXT: vpsubq %xmm3, %xmm5, %xmm3
+; XOP-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; XOP-NEXT: vpsubq %xmm3, %xmm4, %xmm3
; XOP-NEXT: vpsrlq $1, %xmm3, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm2, %xmm2
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT: vpmuludq %xmm2, %xmm9, %xmm2
-; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; XOP-NEXT: vpsllq $32, %xmm2, %xmm2
-; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT: vpsrlq $33, %xmm3, %xmm3
-; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
-; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT: vpsllq $32, %xmm3, %xmm3
-; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
-; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
+; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
@@ -1071,36 +1071,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9
; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
-; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -1109,20 +1109,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1133,36 +1133,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vmovdqa 16(%rsi), %xmm1
; XOP-NEXT: vmovdqa (%rdi), %xmm2
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm4
+; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
+; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
-; XOP-NEXT: vpxor %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm0, %xmm4, %xmm0
-; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm5
+; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT: vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9
; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0
-; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; XOP-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; XOP-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT: vpmuludq %xmm7, %xmm9, %xmm5
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT: vpor %xmm4, %xmm8, %xmm7
; XOP-NEXT: vpmuludq %xmm7, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm5
-; XOP-NEXT: vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT: vpmuludq %xmm7, %xmm6, %xmm4
+; XOP-NEXT: vpaddq %xmm3, %xmm4, %xmm3
; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; XOP-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/movtopush.mir b/llvm/test/CodeGen/X86/movtopush.mir
index f92c385..075b3c3 100644
--- a/llvm/test/CodeGen/X86/movtopush.mir
+++ b/llvm/test/CodeGen/X86/movtopush.mir
@@ -83,8 +83,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
- { id: 0, name: p, type: default, offset: 0, size: 4, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 9e39809..693d199 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -93,10 +93,8 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: pslld $31, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [715827883,715827883,715827883,715827883]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: p4_vector_urem_by_const__splat:
@@ -104,9 +102,9 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-NEXT: psrld $1, %xmm0
-; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
-; SSE4-NEXT: pminud %xmm0, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
+; SSE4-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX2-LABEL: p4_vector_urem_by_const__splat:
@@ -116,9 +114,8 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
%t0 = and <4 x i32> %x, <i32 128, i32 128, i32 128, i32 128> ; clearly a power-of-two or zero
%t1 = urem <4 x i32> %t0, <i32 6, i32 6, i32 6, i32 6> ; '6' is clearly not a power of two
diff --git a/llvm/test/CodeGen/X86/peep-test-5.ll b/llvm/test/CodeGen/X86/peep-test-5.ll
index 52bcbe9..a4af93b 100644
--- a/llvm/test/CodeGen/X86/peep-test-5.ll
+++ b/llvm/test/CodeGen/X86/peep-test-5.ll
@@ -51,3 +51,54 @@ end:
}
declare void @free_object()
+
+; Check that the TEST instruction is not combined with CMP.
+define i1 @pr155586(i8 %0) {
+; CHECK-LABEL: pr155586:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpb $1, %dil
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: retq
+entry:
+ %cmp88.not = icmp eq i8 %0, 1
+ %1 = and i8 %0, 1
+ %tobool161.not = icmp eq i8 %1, 0
+ %common.ret.op = select i1 %cmp88.not, i1 false, i1 %tobool161.not
+ ret i1 %common.ret.op
+}
+
+; Check that the TEST8rr instruction is not combined with TEST8ri.
+define i32 @pr155828() {
+; CHECK-LABEL: pr155828:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB2_1: # %func_188.exit.i.i
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: jne .LBB2_1
+; CHECK-NEXT: # %bb.2: # %if.else.i.i.i
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: retq
+entry:
+ br label %func_188.exit.i.i
+
+func_188.exit.i.i: ; preds = %func_188.exit.i.i, %entry
+ %or659.i167180.i.i = phi i32 [ 0, %entry ], [ 1, %func_188.exit.i.i ]
+ %conv48.i.i = trunc i32 %or659.i167180.i.i to i8
+ %and.i.i.i = and i32 %or659.i167180.i.i, 1
+ %tobool80.not.i.i.i = icmp eq i32 %and.i.i.i, 0
+ br i1 %tobool80.not.i.i.i, label %if.else.i.i.i, label %func_188.exit.i.i
+
+if.else.i.i.i: ; preds = %func_188.exit.i.i
+ %cmp183.i.i.i = icmp sgt i8 %conv48.i.i, 0
+ %ext = zext i1 %cmp183.i.i.i to i32
+ ret i32 %ext
+}
diff --git a/llvm/test/CodeGen/X86/peephole-test-after-add.mir b/llvm/test/CodeGen/X86/peephole-test-after-add.mir
index aeb9988..675c8f4 100644
--- a/llvm/test/CodeGen/X86/peephole-test-after-add.mir
+++ b/llvm/test/CodeGen/X86/peephole-test-after-add.mir
@@ -293,8 +293,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
entry_values: []
@@ -418,8 +418,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
entry_values: []
@@ -544,8 +544,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
entry_values: []
diff --git a/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll b/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll
index 2ca99bd..58dfd63 100644
--- a/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll
+++ b/llvm/test/CodeGen/X86/pr140491-sincos-lifetimes.ll
@@ -51,20 +51,20 @@ entry:
%sincos = tail call { float, float } @llvm.sincos.f32(float %in)
%sin = extractvalue { float, float } %sincos, 0
%cos = extractvalue { float, float } %sincos, 1
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %computed)
+ call void @llvm.lifetime.start.p0(ptr nonnull %computed)
store float %cos, ptr %computed, align 4
call void @use_ptr(ptr nonnull %computed)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %computed)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %computed1)
+ call void @llvm.lifetime.end.p0(ptr nonnull %computed)
+ call void @llvm.lifetime.start.p0(ptr nonnull %computed1)
%fneg_sin = fneg float %sin
store float %fneg_sin, ptr %computed1, align 4
call void @use_ptr(ptr nonnull %computed1)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %computed1)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %computed3)
+ call void @llvm.lifetime.end.p0(ptr nonnull %computed1)
+ call void @llvm.lifetime.start.p0(ptr nonnull %computed3)
%fneg_cos = fneg float %cos
store float %fneg_cos, ptr %computed3, align 4
call void @use_ptr(ptr nonnull %computed3)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %computed3)
+ call void @llvm.lifetime.end.p0(ptr nonnull %computed3)
ret i32 0
}
diff --git a/llvm/test/CodeGen/X86/pr152150.ll b/llvm/test/CodeGen/X86/pr152150.ll
new file mode 100644
index 0000000..6db3e55
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr152150.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown-eabi-elf | FileCheck %s
+
+; CHECK-LABEL: conv2d
+define dso_local void @conv2d() {
+.preheader:
+ br label %0
+
+0: ; preds = %0, %.preheader
+ %1 = phi [4 x <7 x half>] [ zeroinitializer, %.preheader ], [ %4, %0 ]
+ %2 = extractvalue [4 x <7 x half>] %1, 0
+ %3 = extractvalue [4 x <7 x half>] %1, 1
+ %4 = insertvalue [4 x <7 x half>] poison, <7 x half> poison, 3
+ br label %0
+}
diff --git a/llvm/test/CodeGen/X86/pr152630.ll b/llvm/test/CodeGen/X86/pr152630.ll
new file mode 100644
index 0000000..8fa9883
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr152630.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i32 @pr152630(i1 %cond) nounwind {
+; CHECK-LABEL: pr152630:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: cmpl $-1, %edi
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: jne .LBB0_3
+; CHECK-NEXT: .LBB0_2: # %if.then
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_3: # %if.else
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retq
+entry:
+ %sel = select i1 %cond, i32 0, i32 -1
+ %conv = trunc nsw i32 %sel to i8
+ switch i8 %conv, label %if.else [
+ i8 -1, label %if.then
+ i8 0, label %if.then
+ ]
+
+if.then:
+ ret i32 0
+
+if.else:
+ ret i32 1
+}
diff --git a/llvm/test/CodeGen/X86/pr154492.ll b/llvm/test/CodeGen/X86/pr154492.ll
new file mode 100644
index 0000000..1ba1759
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr154492.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+define <16 x i32> @PR154492() {
+; AVX512F-LABEL: PR154492:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: PR154492:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> zeroinitializer, <16 x i32> zeroinitializer, i16 255, i32 4)
+ ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/pr30821.mir b/llvm/test/CodeGen/X86/pr30821.mir
index dccb492..40761e0 100644
--- a/llvm/test/CodeGen/X86/pr30821.mir
+++ b/llvm/test/CodeGen/X86/pr30821.mir
@@ -42,8 +42,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
- { id: 0, name: alpha, type: default, offset: 0, size: 1, alignment: 1,
diff --git a/llvm/test/CodeGen/X86/pr33010.ll b/llvm/test/CodeGen/X86/pr33010.ll
index 6f0ce66..41e44db 100644
--- a/llvm/test/CodeGen/X86/pr33010.ll
+++ b/llvm/test/CodeGen/X86/pr33010.ll
@@ -19,13 +19,10 @@ define ptr addrspace(1) @test(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %whic
; CHECK-NEXT: callq f@PLT
; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: testb $1, %bl
-; CHECK-NEXT: je .LBB0_1
-; CHECK-NEXT: # %bb.2: # %entry
-; CHECK-NEXT: movq (%rsp), %rax
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_1:
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: .LBB0_3: # %entry
+; CHECK-NEXT: movq %rsp, %rax
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: cmovneq %rax, %rcx
+; CHECK-NEXT: movq (%rcx), %rax
; CHECK-NEXT: addq $16, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index c3c96e8..6a0c135 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -260,7 +260,6 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: .cfi_offset %ebx, -8
-; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: jmp .LBB1_1
@@ -272,10 +271,9 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl %edx, %ebx
-; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: .LBB1_1: # %for.cond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB1_3
; CHECK-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: xorl %eax, %eax
@@ -283,12 +281,11 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_3: # %if.end
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je .LBB1_4
; CHECK-NEXT: # %bb.9: # %if.then13
; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: movl $0, %ebx
; CHECK-NEXT: jne .LBB1_8
diff --git a/llvm/test/CodeGen/X86/pr38952.mir b/llvm/test/CodeGen/X86/pr38952.mir
index 1d18738..e6fab07 100644
--- a/llvm/test/CodeGen/X86/pr38952.mir
+++ b/llvm/test/CodeGen/X86/pr38952.mir
@@ -60,8 +60,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
stack:
constants:
diff --git a/llvm/test/CodeGen/X86/pr48064.mir b/llvm/test/CodeGen/X86/pr48064.mir
index 9712a3ca..eb74edd 100644
--- a/llvm/test/CodeGen/X86/pr48064.mir
+++ b/llvm/test/CodeGen/X86/pr48064.mir
@@ -236,8 +236,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
@@ -292,8 +292,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: zx, type: default, offset: 0, size: 16, alignment: 4,
@@ -403,8 +403,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default,
isImmutable: false, isAliased: false, callee-saved-register: '',
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 9728e13..ce03f8f 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -28,8 +28,9 @@ define i64 @PR62286(i32 %a) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
@@ -42,10 +43,10 @@ define i64 @PR62286(i32 %a) {
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -58,12 +59,13 @@ define i64 @PR62286(i32 %a) {
; AVX512-LABEL: PR62286:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %edi, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; AVX512-NEXT: movw $4369, %ax # imm = 0x1111
+; AVX512-NEXT: movb $8, %al
; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vpaddd %zmm0, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT: vpmovsxdq %ymm1, %zmm0
+; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll
index 9463809..cbb730857 100644
--- a/llvm/test/CodeGen/X86/pr67333.ll
+++ b/llvm/test/CodeGen/X86/pr67333.ll
@@ -14,12 +14,12 @@ define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vpsrld $17, %xmm2, %xmm0
; CHECK-NEXT: vpslld $15, %xmm2, %xmm3
-; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3
+; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm3
+; CHECK-NEXT: vpsrld $19, %xmm2, %xmm0
; CHECK-NEXT: vpslld $13, %xmm2, %xmm4
-; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3
-; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm0
+; CHECK-NEXT: vpor %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpxor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vmovd %ecx, %xmm4
; CHECK-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/ptrtoaddr.ll b/llvm/test/CodeGen/X86/ptrtoaddr.ll
new file mode 100644
index 0000000..24bf9db
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ptrtoaddr.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=CHECK
+
+define i1 @ptrtoaddr_1(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: xorb $1, %al
+; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i1
+ %ret = xor i1 %trunc, 1
+ ret i1 %ret
+}
+
+define i8 @ptrtoaddr_8(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notb %al
+; CHECK-NEXT: # kill: def $al killed $al killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i8
+ %ret = xor i8 %trunc, -1
+ ret i8 %ret
+}
+
+define i16 @ptrtoaddr_16(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i16
+ %ret = xor i16 %trunc, -1
+ ret i16 %ret
+}
+
+define i32 @ptrtoaddr_32(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %trunc = trunc i64 %addr to i32
+ %ret = xor i32 %trunc, -1
+ ret i32 %ret
+}
+
+define i64 @ptrtoaddr_64(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %ret = xor i64 %addr, -1
+ ret i64 %ret
+}
+
+define i128 @ptrtoaddr_128(ptr %p) {
+; CHECK-LABEL: ptrtoaddr_128:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: movq $-1, %rdx
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr ptr %p to i64
+ %ext = zext i64 %addr to i128
+ %ret = xor i128 %ext, -1
+ ret i128 %ret
+}
+
+
+define <2 x i64> @ptrtoaddr_vec(<2 x ptr> %p) {
+; CHECK-LABEL: ptrtoaddr_vec:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %addr = ptrtoaddr <2 x ptr> %p to <2 x i64>
+ %ret = xor <2 x i64> %addr, <i64 -1, i64 -1>
+ ret <2 x i64> %ret
+}
+
+; UTC_ARGS: --disable
+
+@foo = global [16 x i8] zeroinitializer
+@addr = global i64 ptrtoaddr (ptr @foo to i64)
+; CHECK: addr:
+; CHECK-NEXT: .quad foo
+; CHECK-NEXT: .size addr, 8
+@addr_plus_one = global i64 ptrtoaddr (ptr getelementptr (i8, ptr @foo, i64 1) to i64)
+; CHECK: addr_plus_one:
+; CHECK-NEXT: .quad foo+1
+; CHECK-NEXT: .size addr_plus_one, 8
+@const_addr = global i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64)
+; CHECK: const_addr:
+; CHECK-NEXT: .quad 0+1
+; CHECK-NEXT: .size const_addr, 8
diff --git a/llvm/test/CodeGen/X86/scalarize-strict-fsetcc.ll b/llvm/test/CodeGen/X86/scalarize-strict-fsetcc.ll
new file mode 100644
index 0000000..b4c77a5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/scalarize-strict-fsetcc.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
+
+define <1 x i1> @test_oeq_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_oeq_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %cl
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oeq", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ogt_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ogt_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ogt", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_oge_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_oge_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oge", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_olt_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_olt_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm0, %xmm1
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"olt", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ole_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ole_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm0, %xmm1
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ole", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_one_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_one_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"one", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ord_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ord_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ord", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ueq_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ueq_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ord", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ugt_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ugt_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm0, %xmm1
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ugt", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_uge_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_uge_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm0, %xmm1
+; CHECK-NEXT: setbe %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"uge", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ult_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ult_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ult", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ule_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ule_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setbe %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ule", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_une_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_une_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setp %cl
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"une", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_uno_q_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_uno_q_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: setp %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"uno", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_oeq_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_oeq_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %cl
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oeq", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ogt_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ogt_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ogt", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_oge_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_oge_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oge", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_olt_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_olt_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm0, %xmm1
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"olt", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ole_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ole_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm0, %xmm1
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ole", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_one_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_one_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"one", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ord_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ord_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ord", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ueq_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ueq_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setnp %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ord", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ugt_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ugt_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm0, %xmm1
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ugt", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_uge_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_uge_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm0, %xmm1
+; CHECK-NEXT: setbe %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"uge", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ult_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ult_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ult", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_ule_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_ule_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setbe %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ule", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_une_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_une_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setp %cl
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"une", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+define <1 x i1> @test_uno_s_v1f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_uno_s_v1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: setp %al
+; CHECK-NEXT: retq
+ %cond = tail call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"uno", metadata !"fpexcept.strict")
+ ret <1 x i1> %cond
+}
+
+declare <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double>, <1 x double>, metadata, metadata)
+declare <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double>, <1 x double>, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/scheduler-asm-moves.mir b/llvm/test/CodeGen/X86/scheduler-asm-moves.mir
index 87a56a3..045b021 100644
--- a/llvm/test/CodeGen/X86/scheduler-asm-moves.mir
+++ b/llvm/test/CodeGen/X86/scheduler-asm-moves.mir
@@ -106,8 +106,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: true
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index c7cf9cb..6cb49f2 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -233,7 +233,7 @@ define i32 @expensive_val_operand5(i32 %b, i32 %y, i1 %cmp) {
; CHECK-LABEL: @expensive_val_operand5(
; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 8
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[A]])
; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
; CHECK: select.true.sink:
@@ -245,7 +245,7 @@ define i32 @expensive_val_operand5(i32 %b, i32 %y, i1 %cmp) {
;
%a = alloca i32
%load = load i32, ptr %a, align 8
- call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %a)
+ call void @llvm.lifetime.end.p0(ptr nonnull %a)
%x = add i32 %load, %b
%sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
ret i32 %sel
@@ -520,7 +520,7 @@ for.body: ; preds = %for.body.preheader,
declare void @llvm.dbg.value(metadata, metadata, metadata)
; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.end.p0(ptr nocapture)
declare void @free(ptr nocapture)
diff --git a/llvm/test/CodeGen/X86/select-smin-smax.ll b/llvm/test/CodeGen/X86/select-smin-smax.ll
index a7fb60f..513983b 100644
--- a/llvm/test/CodeGen/X86/select-smin-smax.ll
+++ b/llvm/test/CodeGen/X86/select-smin-smax.ll
@@ -1,66 +1,367 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOBMI
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BMI
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi < %s | FileCheck %s --check-prefixes=X64,X64-BMI
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi < %s | FileCheck %s --check-prefixes=X64,X64-NOBMI
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+bmi,+cmov < %s | FileCheck %s --check-prefixes=X86,X86-BMI
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-bmi < %s | FileCheck %s --check-prefixes=X86,X86-NOBMI
+declare i8 @llvm.smax.i8(i8, i8)
+declare i8 @llvm.smin.i8(i8, i8)
+declare i16 @llvm.smax.i16(i16, i16)
+declare i16 @llvm.smin.i16(i16, i16)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
+declare i128 @llvm.smax.i128(i128, i128)
+declare i128 @llvm.smin.i128(i128, i128)
+
+define i8 @test_i8_smax(i8 %a) nounwind {
+; X64-LABEL: test_i8_smax:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: cmovgl %edi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X86-BMI-LABEL: test_i8_smax:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: xorl %eax, %eax
+; X86-BMI-NEXT: testb %cl, %cl
+; X86-BMI-NEXT: cmovgl %ecx, %eax
+; X86-BMI-NEXT: # kill: def $al killed $al killed $eax
+; X86-BMI-NEXT: retl
+;
+; X86-NOBMI-LABEL: test_i8_smax:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: testb %al, %al
+; X86-NOBMI-NEXT: jg .LBB0_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: .LBB0_2:
+; X86-NOBMI-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOBMI-NEXT: retl
+ %r = call i8 @llvm.smax.i8(i8 %a, i8 0)
+ ret i8 %r
+}
+
+define i8 @test_i8_smin(i8 %a) nounwind {
+; X64-LABEL: test_i8_smin:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: cmovsl %edi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X86-BMI-LABEL: test_i8_smin:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: xorl %eax, %eax
+; X86-BMI-NEXT: testb %cl, %cl
+; X86-BMI-NEXT: cmovsl %ecx, %eax
+; X86-BMI-NEXT: # kill: def $al killed $al killed $eax
+; X86-BMI-NEXT: retl
+;
+; X86-NOBMI-LABEL: test_i8_smin:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: testb %al, %al
+; X86-NOBMI-NEXT: js .LBB1_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: .LBB1_2:
+; X86-NOBMI-NEXT: # kill: def $al killed $al killed $eax
+; X86-NOBMI-NEXT: retl
+ %r = call i8 @llvm.smin.i8(i8 %a, i8 0)
+ ret i8 %r
+}
+
+define i16 @test_i16_smax(i16 %a) nounwind {
+; X64-LABEL: test_i16_smax:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: cmovgl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-BMI-LABEL: test_i16_smax:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: xorl %eax, %eax
+; X86-BMI-NEXT: testw %cx, %cx
+; X86-BMI-NEXT: cmovgl %ecx, %eax
+; X86-BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-BMI-NEXT: retl
+;
+; X86-NOBMI-LABEL: test_i16_smax:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: testw %ax, %ax
+; X86-NOBMI-NEXT: jg .LBB2_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: .LBB2_2:
+; X86-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOBMI-NEXT: retl
+ %r = call i16 @llvm.smax.i16(i16 %a, i16 0)
+ ret i16 %r
+}
+
+define i16 @test_i16_smin(i16 %a) nounwind {
+; X64-LABEL: test_i16_smin:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: cmovsl %edi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X86-BMI-LABEL: test_i16_smin:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: xorl %eax, %eax
+; X86-BMI-NEXT: testw %cx, %cx
+; X86-BMI-NEXT: cmovsl %ecx, %eax
+; X86-BMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-BMI-NEXT: retl
+;
+; X86-NOBMI-LABEL: test_i16_smin:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: testw %ax, %ax
+; X86-NOBMI-NEXT: js .LBB3_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: .LBB3_2:
+; X86-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NOBMI-NEXT: retl
+ %r = call i16 @llvm.smin.i16(i16 %a, i16 0)
+ ret i16 %r
+}
define i32 @test_i32_smax(i32 %a) nounwind {
-; CHECK-NOBMI-LABEL: test_i32_smax:
-; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: xorl %eax, %eax
-; CHECK-NOBMI-NEXT: testl %edi, %edi
-; CHECK-NOBMI-NEXT: cmovgl %edi, %eax
-; CHECK-NOBMI-NEXT: retq
-;
-; CHECK-BMI-LABEL: test_i32_smax:
-; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movl %edi, %eax
-; CHECK-BMI-NEXT: sarl $31, %eax
-; CHECK-BMI-NEXT: andnl %edi, %eax, %eax
-; CHECK-BMI-NEXT: retq
+; X64-BMI-LABEL: test_i32_smax:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: movl %edi, %eax
+; X64-BMI-NEXT: sarl $31, %eax
+; X64-BMI-NEXT: andnl %edi, %eax, %eax
+; X64-BMI-NEXT: retq
+;
+; X64-NOBMI-LABEL: test_i32_smax:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: xorl %eax, %eax
+; X64-NOBMI-NEXT: testl %edi, %edi
+; X64-NOBMI-NEXT: cmovgl %edi, %eax
+; X64-NOBMI-NEXT: retq
+;
+; X86-BMI-LABEL: test_i32_smax:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl %eax, %ecx
+; X86-BMI-NEXT: sarl $31, %ecx
+; X86-BMI-NEXT: andnl %eax, %ecx, %eax
+; X86-BMI-NEXT: retl
+;
+; X86-NOBMI-LABEL: test_i32_smax:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: testl %eax, %eax
+; X86-NOBMI-NEXT: jg .LBB4_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: .LBB4_2:
+; X86-NOBMI-NEXT: retl
%r = call i32 @llvm.smax.i32(i32 %a, i32 0)
ret i32 %r
}
define i32 @test_i32_smin(i32 %a) nounwind {
-; CHECK-LABEL: test_i32_smin:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: andl %edi, %eax
-; CHECK-NEXT: retq
+; X64-LABEL: test_i32_smin:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: sarl $31, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_i32_smin:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: retl
%r = call i32 @llvm.smin.i32(i32 %a, i32 0)
ret i32 %r
}
define i64 @test_i64_smax(i64 %a) nounwind {
-; CHECK-NOBMI-LABEL: test_i64_smax:
-; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: xorl %eax, %eax
-; CHECK-NOBMI-NEXT: testq %rdi, %rdi
-; CHECK-NOBMI-NEXT: cmovgq %rdi, %rax
-; CHECK-NOBMI-NEXT: retq
-;
-; CHECK-BMI-LABEL: test_i64_smax:
-; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movq %rdi, %rax
-; CHECK-BMI-NEXT: sarq $63, %rax
-; CHECK-BMI-NEXT: andnq %rdi, %rax, %rax
-; CHECK-BMI-NEXT: retq
+; X64-BMI-LABEL: test_i64_smax:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: movq %rdi, %rax
+; X64-BMI-NEXT: sarq $63, %rax
+; X64-BMI-NEXT: andnq %rdi, %rax, %rax
+; X64-BMI-NEXT: retq
+;
+; X64-NOBMI-LABEL: test_i64_smax:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: xorl %eax, %eax
+; X64-NOBMI-NEXT: testq %rdi, %rdi
+; X64-NOBMI-NEXT: cmovgq %rdi, %rax
+; X64-NOBMI-NEXT: retq
+;
+; X86-BMI-LABEL: test_i64_smax:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: xorl %eax, %eax
+; X86-BMI-NEXT: testl %edx, %edx
+; X86-BMI-NEXT: cmovlel %eax, %edx
+; X86-BMI-NEXT: cmovnsl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: retl
+;
+; X86-NOBMI-LABEL: test_i64_smax:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: testl %edx, %edx
+; X86-NOBMI-NEXT: movl $0, %eax
+; X86-NOBMI-NEXT: jns .LBB6_1
+; X86-NOBMI-NEXT: # %bb.2:
+; X86-NOBMI-NEXT: jle .LBB6_3
+; X86-NOBMI-NEXT: .LBB6_4:
+; X86-NOBMI-NEXT: retl
+; X86-NOBMI-NEXT: .LBB6_1:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: jg .LBB6_4
+; X86-NOBMI-NEXT: .LBB6_3:
+; X86-NOBMI-NEXT: xorl %edx, %edx
+; X86-NOBMI-NEXT: retl
%r = call i64 @llvm.smax.i64(i64 %a, i64 0)
ret i64 %r
}
define i64 @test_i64_smin(i64 %a) nounwind {
-; CHECK-LABEL: test_i64_smin:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: sarq $63, %rax
-; CHECK-NEXT: andq %rdi, %rax
-; CHECK-NEXT: retq
+; X64-LABEL: test_i64_smin:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: test_i64_smin:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
%r = call i64 @llvm.smin.i64(i64 %a, i64 0)
ret i64 %r
}
+
+define i128 @test_i128_smax(i128 %a) nounwind {
+; X64-LABEL: test_i128_smax:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: testq %rsi, %rsi
+; X64-NEXT: cmovsq %rdx, %rax
+; X64-NEXT: cmovgq %rsi, %rdx
+; X64-NEXT: retq
+;
+; X86-BMI-LABEL: test_i128_smax:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: pushl %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: xorl %edx, %edx
+; X86-BMI-NEXT: testl %ecx, %ecx
+; X86-BMI-NEXT: cmovlel %edx, %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: cmovsl %edx, %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: cmovsl %edx, %edi
+; X86-BMI-NEXT: cmovnsl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT: movl %ecx, 12(%eax)
+; X86-BMI-NEXT: movl %edx, 8(%eax)
+; X86-BMI-NEXT: movl %edi, 4(%eax)
+; X86-BMI-NEXT: movl %esi, (%eax)
+; X86-BMI-NEXT: addl $4, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: retl $4
+;
+; X86-NOBMI-LABEL: test_i128_smax:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: pushl %edi
+; X86-NOBMI-NEXT: pushl %esi
+; X86-NOBMI-NEXT: pushl %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: testl %ecx, %ecx
+; X86-NOBMI-NEXT: movl $0, %edx
+; X86-NOBMI-NEXT: movl $0, %esi
+; X86-NOBMI-NEXT: movl $0, %edi
+; X86-NOBMI-NEXT: js .LBB8_2
+; X86-NOBMI-NEXT: # %bb.1:
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: .LBB8_2:
+; X86-NOBMI-NEXT: jg .LBB8_4
+; X86-NOBMI-NEXT: # %bb.3:
+; X86-NOBMI-NEXT: xorl %ecx, %ecx
+; X86-NOBMI-NEXT: .LBB8_4:
+; X86-NOBMI-NEXT: movl %ecx, 12(%eax)
+; X86-NOBMI-NEXT: movl %edi, 8(%eax)
+; X86-NOBMI-NEXT: movl %esi, 4(%eax)
+; X86-NOBMI-NEXT: movl %edx, (%eax)
+; X86-NOBMI-NEXT: addl $4, %esp
+; X86-NOBMI-NEXT: popl %esi
+; X86-NOBMI-NEXT: popl %edi
+; X86-NOBMI-NEXT: retl $4
+ %r = call i128 @llvm.smax.i128(i128 %a, i128 0)
+ ret i128 %r
+}
+
+define i128 @test_i128_smin(i128 %a) nounwind {
+; X64-LABEL: test_i128_smin:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: sarq $63, %rdx
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: andq %rsi, %rdx
+; X64-NEXT: retq
+;
+; X86-LABEL: test_i128_smin:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl %edx, %edi
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+ %r = call i128 @llvm.smin.i128(i128 %a, i128 0)
+ ret i128 %r
+}
diff --git a/llvm/test/CodeGen/X86/shrink_wrap_dbg_value.mir b/llvm/test/CodeGen/X86/shrink_wrap_dbg_value.mir
index af2a18d..52cbcf2 100644
--- a/llvm/test/CodeGen/X86/shrink_wrap_dbg_value.mir
+++ b/llvm/test/CodeGen/X86/shrink_wrap_dbg_value.mir
@@ -120,10 +120,12 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- # CHECK: savePoint: '%bb.1'
- # CHECK: restorePoint: '%bb.3'
- savePoint: ''
- restorePoint: ''
+ # CHECK: savePoint:
+ # CHECK-NEXT: - point: '%bb.1'
+ # CHECK: restorePoint:
+ # CHECK-NEXT: - point: '%bb.3'
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll
index d9b20f5..4c8bb62 100644
--- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll
@@ -16,13 +16,13 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X64-NEXT: cmpq %r11, %rsp
; CHECK-X64-NEXT: jne .LBB0_1
; CHECK-X64-NEXT: # %bb.2:
-; CHECK-X64-NEXT: subq $3976, %rsp # imm = 0xF88
+; CHECK-X64-NEXT: subq $3992, %rsp # imm = 0xF98
; CHECK-X64-NEXT: .cfi_def_cfa_register %rsp
-; CHECK-X64-NEXT: .cfi_def_cfa_offset 4799999888
-; CHECK-X64-NEXT: movl $1, 264(%rsp)
-; CHECK-X64-NEXT: movl $1, 28664(%rsp)
-; CHECK-X64-NEXT: movl -128(%rsp), %eax
-; CHECK-X64-NEXT: movabsq $4799999880, %rcx # imm = 0x11E1A2F88
+; CHECK-X64-NEXT: .cfi_def_cfa_offset 4799999904
+; CHECK-X64-NEXT: movl $1, 280(%rsp)
+; CHECK-X64-NEXT: movl $1, 28680(%rsp)
+; CHECK-X64-NEXT: movl -112(%rsp), %eax
+; CHECK-X64-NEXT: movabsq $4799999896, %rcx # imm = 0x11E1A2F98
; CHECK-X64-NEXT: addq %rcx, %rsp
; CHECK-X64-NEXT: .cfi_def_cfa_offset 8
; CHECK-X64-NEXT: retq
@@ -30,10 +30,10 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X86-LABEL: foo:
; CHECK-X86: # %bb.0:
; CHECK-X86-NEXT: ud2
-; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000016
-; CHECK-X86-NEXT: movl $1, 392(%esp)
-; CHECK-X86-NEXT: movl $1, 28792(%esp)
-; CHECK-X86-NEXT: movl (%esp), %eax
+; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000032
+; CHECK-X86-NEXT: movl $1, 408(%esp)
+; CHECK-X86-NEXT: movl $1, 28808(%esp)
+; CHECK-X86-NEXT: movl 16(%esp), %eax
; CHECK-X86-NEXT: ud2
; CHECK-X86-NEXT: .cfi_def_cfa_offset 4
; CHECK-X86-NEXT: retl
@@ -41,10 +41,10 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X32-LABEL: foo:
; CHECK-X32: # %bb.0:
; CHECK-X32-NEXT: ud2
-; CHECK-X32-NEXT: .cfi_def_cfa_offset 4799999888
-; CHECK-X32-NEXT: movl $1, 264(%esp)
-; CHECK-X32-NEXT: movl $1, 28664(%esp)
-; CHECK-X32-NEXT: movl -128(%esp), %eax
+; CHECK-X32-NEXT: .cfi_def_cfa_offset 4799999904
+; CHECK-X32-NEXT: movl $1, 280(%esp)
+; CHECK-X32-NEXT: movl $1, 28680(%esp)
+; CHECK-X32-NEXT: movl -112(%esp), %eax
; CHECK-X32-NEXT: ud2
; CHECK-X32-NEXT: .cfi_def_cfa_offset 8
; CHECK-X32-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/stack-clash-huge.ll b/llvm/test/CodeGen/X86/stack-clash-huge.ll
index c999077..0e8c215 100644
--- a/llvm/test/CodeGen/X86/stack-clash-huge.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-huge.ll
@@ -16,13 +16,13 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X64-NEXT: cmpq %r11, %rsp
; CHECK-X64-NEXT: jne .LBB0_1
; CHECK-X64-NEXT: # %bb.2:
-; CHECK-X64-NEXT: subq $1928, %rsp # imm = 0x788
+; CHECK-X64-NEXT: subq $1944, %rsp # imm = 0x798
; CHECK-X64-NEXT: .cfi_def_cfa_register %rsp
-; CHECK-X64-NEXT: .cfi_def_cfa_offset 2399999888
-; CHECK-X64-NEXT: movl $1, 264(%rsp)
-; CHECK-X64-NEXT: movl $1, 28664(%rsp)
-; CHECK-X64-NEXT: movl -128(%rsp), %eax
-; CHECK-X64-NEXT: movl $2399999880, %ecx # imm = 0x8F0D1788
+; CHECK-X64-NEXT: .cfi_def_cfa_offset 2399999904
+; CHECK-X64-NEXT: movl $1, 280(%rsp)
+; CHECK-X64-NEXT: movl $1, 28680(%rsp)
+; CHECK-X64-NEXT: movl -112(%rsp), %eax
+; CHECK-X64-NEXT: movl $2399999896, %ecx # imm = 0x8F0D1798
; CHECK-X64-NEXT: addq %rcx, %rsp
; CHECK-X64-NEXT: .cfi_def_cfa_offset 8
; CHECK-X64-NEXT: retq
@@ -39,13 +39,13 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X86-NEXT: cmpl %eax, %esp
; CHECK-X86-NEXT: jne .LBB0_1
; CHECK-X86-NEXT: # %bb.2:
-; CHECK-X86-NEXT: subl $2060, %esp # imm = 0x80C
+; CHECK-X86-NEXT: subl $2076, %esp # imm = 0x81C
; CHECK-X86-NEXT: .cfi_def_cfa_register %esp
-; CHECK-X86-NEXT: .cfi_def_cfa_offset 2400000016
-; CHECK-X86-NEXT: movl $1, 392(%esp)
-; CHECK-X86-NEXT: movl $1, 28792(%esp)
-; CHECK-X86-NEXT: movl (%esp), %eax
-; CHECK-X86-NEXT: movl $2400000012, %ecx # imm = 0x8F0D180C
+; CHECK-X86-NEXT: .cfi_def_cfa_offset 2400000032
+; CHECK-X86-NEXT: movl $1, 408(%esp)
+; CHECK-X86-NEXT: movl $1, 28808(%esp)
+; CHECK-X86-NEXT: movl 16(%esp), %eax
+; CHECK-X86-NEXT: movl $2400000028, %ecx # imm = 0x8F0D181C
; CHECK-X86-NEXT: addl %ecx, %esp
; CHECK-X86-NEXT: .cfi_def_cfa_offset 4
; CHECK-X86-NEXT: retl
@@ -62,13 +62,13 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X32-NEXT: cmpl %r11d, %esp
; CHECK-X32-NEXT: jne .LBB0_1
; CHECK-X32-NEXT: # %bb.2:
-; CHECK-X32-NEXT: subl $1928, %esp # imm = 0x788
+; CHECK-X32-NEXT: subl $1944, %esp # imm = 0x798
; CHECK-X32-NEXT: .cfi_def_cfa_register %rsp
-; CHECK-X32-NEXT: .cfi_def_cfa_offset 2399999888
-; CHECK-X32-NEXT: movl $1, 264(%esp)
-; CHECK-X32-NEXT: movl $1, 28664(%esp)
-; CHECK-X32-NEXT: movl -128(%esp), %eax
-; CHECK-X32-NEXT: movl $2399999880, %ecx # imm = 0x8F0D1788
+; CHECK-X32-NEXT: .cfi_def_cfa_offset 2399999904
+; CHECK-X32-NEXT: movl $1, 280(%esp)
+; CHECK-X32-NEXT: movl $1, 28680(%esp)
+; CHECK-X32-NEXT: movl -112(%esp), %eax
+; CHECK-X32-NEXT: movl $2399999896, %ecx # imm = 0x8F0D1798
; CHECK-X32-NEXT: addl %ecx, %esp
; CHECK-X32-NEXT: .cfi_def_cfa_offset 8
; CHECK-X32-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-call.mir b/llvm/test/CodeGen/X86/statepoint-fixup-call.mir
index 2e0efa9..100d892 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-call.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-call.mir
@@ -58,8 +58,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-copy-prop-neg.mir b/llvm/test/CodeGen/X86/statepoint-fixup-copy-prop-neg.mir
index e87e4d7..257ea88 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-copy-prop-neg.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-copy-prop-neg.mir
@@ -59,8 +59,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir b/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
index a44b625..16796d5 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
@@ -76,8 +76,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir b/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
index a2a194b..336b55c 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
@@ -93,8 +93,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-undef-def.mir b/llvm/test/CodeGen/X86/statepoint-fixup-undef-def.mir
index 1a88a17..8dab329 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-undef-def.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-undef-def.mir
@@ -73,8 +73,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
index 05d04f5..ef053ca 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
@@ -71,8 +71,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
index 5f05270..29c22bb 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir
@@ -241,8 +241,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-hoist-copies.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-hoist-copies.mir
index cf91282..a50b0a5 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-hoist-copies.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-hoist-copies.mir
@@ -408,8 +408,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
index fcebc69..0c5c366 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-inline-spiller.mir
@@ -185,8 +185,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
index 8bb39a0..05c7ceb 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-remove-back-copies.mir
@@ -236,8 +236,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra.mir
index da651039..5ce76a3e 100644
--- a/llvm/test/CodeGen/X86/statepoint-invoke-ra.mir
+++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra.mir
@@ -182,8 +182,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack:
- { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4,
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-folding.mir b/llvm/test/CodeGen/X86/statepoint-vreg-folding.mir
index d40a9a0..8b6445b 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg-folding.mir
+++ b/llvm/test/CodeGen/X86/statepoint-vreg-folding.mir
@@ -123,8 +123,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 16, size: 8, alignment: 16, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
diff --git a/llvm/test/CodeGen/X86/tied-depbreak.mir b/llvm/test/CodeGen/X86/tied-depbreak.mir
index 4eca287..cc3d8df 100644
--- a/llvm/test/CodeGen/X86/tied-depbreak.mir
+++ b/llvm/test/CodeGen/X86/tied-depbreak.mir
@@ -40,8 +40,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
index 135b14d..75953af 100644
--- a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
+++ b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
@@ -72,8 +72,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []
diff --git a/llvm/test/CodeGen/X86/vector-llrint-f16.ll b/llvm/test/CodeGen/X86/vector-llrint-f16.ll
index 5e5c5849..eb7be61 100644
--- a/llvm/test/CodeGen/X86/vector-llrint-f16.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint-f16.ll
@@ -1,10 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; FIXME: crash "Do not know how to split the result of this operator!"
+; SKIP: sed 's/XRINT/lrint/g' %s | llc -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; SKIP: sed 's/XRINT/llrint/g' %s | llc -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
; RUN: sed 's/XRINT/lrint/g' %s | llc -mtriple=x86_64-unknown -mattr=avx2,f16c | FileCheck %s --check-prefix=AVX
; RUN: sed 's/XRINT/llrint/g' %s | llc -mtriple=x86_64-unknown -mattr=avx2,f16c | FileCheck %s --check-prefix=AVX
; RUN: sed 's/XRINT/lrint/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefix=FP16
; RUN: sed 's/XRINT/llrint/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefix=FP16
-define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) {
+define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) nounwind {
; AVX-LABEL: llrint_v1i64_v1f16:
; AVX: # %bb.0:
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -22,7 +25,7 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) {
ret <1 x i64> %a
}
-define <2 x i64> @llrint_v2i64_v2f16(<2 x half> %x) {
+define <2 x i64> @llrint_v2i64_v2f16(<2 x half> %x) nounwind {
; AVX-LABEL: llrint_v2i64_v2f16:
; AVX: # %bb.0:
; AVX-NEXT: vcvtph2ps %xmm0, %xmm1
@@ -49,7 +52,7 @@ define <2 x i64> @llrint_v2i64_v2f16(<2 x half> %x) {
ret <2 x i64> %a
}
-define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) {
+define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) nounwind {
; AVX-LABEL: llrint_v4i64_v4f16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm1
@@ -92,7 +95,7 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) {
ret <4 x i64> %a
}
-define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) {
+define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) nounwind {
; AVX-LABEL: llrint_v8i64_v8f16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm1
@@ -167,7 +170,7 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) {
ret <8 x i64> %a
}
-define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) {
+define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) nounwind {
; AVX-LABEL: llrint_v16i64_v16f16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa %ymm0, %ymm2
@@ -307,7 +310,7 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) {
ret <16 x i64> %a
}
-define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) {
+define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) nounwind {
; AVX-LABEL: llrint_v32i64_v32f16:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 7017eb6..6fd1a35 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -1,10 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX512DQ
-define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
+define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) nounwind {
+; X86-LABEL: llrint_v1i64_v1f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: flds 8(%ebp)
+; X86-NEXT: fistpll (%esp)
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
; SSE-LABEL: llrint_v1i64_v1f32:
; SSE: # %bb.0:
; SSE-NEXT: cvtss2si %xmm0, %rax
@@ -24,7 +39,34 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
}
declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)
-define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
+define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) nounwind {
+; X86-LABEL: llrint_v2i64_v2f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: flds 16(%ebp)
+; X86-NEXT: flds 12(%ebp)
+; X86-NEXT: fistpll (%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -8(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v2i64_v2f32:
; SSE: # %bb.0:
; SSE-NEXT: cvtss2si %xmm0, %rax
@@ -55,7 +97,54 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
}
declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>)
-define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
+define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) nounwind {
+; X86-LABEL: llrint_v4i64_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $56, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: flds 24(%ebp)
+; X86-NEXT: flds 20(%ebp)
+; X86-NEXT: flds 16(%ebp)
+; X86-NEXT: flds 12(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, 28(%eax)
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %edx, 20(%eax)
+; X86-NEXT: movl %ebx, 16(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v4i64_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: cvtss2si %xmm0, %rax
@@ -121,7 +210,94 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
}
declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>)
-define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
+define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) nounwind {
+; X86-LABEL: llrint_v8i64_v8f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $120, %esp
+; X86-NEXT: flds 12(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 16(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 20(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 24(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 28(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 32(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 36(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 40(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, 60(%eax)
+; X86-NEXT: movl %ecx, 56(%eax)
+; X86-NEXT: movl %edx, 52(%eax)
+; X86-NEXT: movl %esi, 48(%eax)
+; X86-NEXT: movl %edi, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v8i64_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
@@ -235,7 +411,174 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
}
declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
-define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
+define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) nounwind {
+; X86-LABEL: llrint_v16i64_v16f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $248, %esp
+; X86-NEXT: flds 12(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 16(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 20(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 24(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 28(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 32(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 36(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 40(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 44(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 48(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 52(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 56(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 60(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 64(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 68(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: flds 72(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, 124(%eax)
+; X86-NEXT: movl %ecx, 120(%eax)
+; X86-NEXT: movl %edx, 116(%eax)
+; X86-NEXT: movl %esi, 112(%eax)
+; X86-NEXT: movl %edi, 108(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 104(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 100(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 96(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 92(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 88(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 84(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 80(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 76(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 72(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 68(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 64(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 60(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 56(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 48(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v16i64_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
@@ -451,7 +794,21 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
}
declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>)
-define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
+define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) nounwind {
+; X86-LABEL: llrint_v1i64_v1f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: fldl 8(%ebp)
+; X86-NEXT: fistpll (%esp)
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
; SSE-LABEL: llrint_v1i64_v1f64:
; SSE: # %bb.0:
; SSE-NEXT: cvtsd2si %xmm0, %rax
@@ -471,7 +828,34 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
}
declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>)
-define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
+define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) nounwind {
+; X86-LABEL: llrint_v2i64_v2f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: fldl 20(%ebp)
+; X86-NEXT: fldl 12(%ebp)
+; X86-NEXT: fistpll (%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -8(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v2i64_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: cvtsd2si %xmm0, %rax
@@ -502,7 +886,54 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
}
declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>)
-define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
+define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) nounwind {
+; X86-LABEL: llrint_v4i64_v4f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $56, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: fldl 36(%ebp)
+; X86-NEXT: fldl 28(%ebp)
+; X86-NEXT: fldl 20(%ebp)
+; X86-NEXT: fldl 12(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, 28(%eax)
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %edx, 20(%eax)
+; X86-NEXT: movl %ebx, 16(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v4i64_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: cvtsd2si %xmm0, %rax
@@ -566,7 +997,94 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
}
declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
-define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
+define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
+; X86-LABEL: llrint_v8i64_v8f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $120, %esp
+; X86-NEXT: fldl 12(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 20(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 28(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 36(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 44(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 52(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 60(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: fldl 68(%ebp)
+; X86-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, 60(%eax)
+; X86-NEXT: movl %ecx, 56(%eax)
+; X86-NEXT: movl %edx, 52(%eax)
+; X86-NEXT: movl %esi, 48(%eax)
+; X86-NEXT: movl %edi, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
; SSE-LABEL: llrint_v8i64_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: cvtsd2si %xmm0, %rax
@@ -673,3 +1191,655 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
ret <8 x i64> %a
}
declare <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double>)
+
+define <1 x i64> @llrint_v1i64_v1f128(<1 x fp128> %x) nounwind {
+; X86-LABEL: llrint_v1i64_v1f128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl 12(%ebp)
+; X86-NEXT: pushl 8(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: llrint_v1i64_v1f128:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: popq %rcx
+; SSE-NEXT: retq
+;
+; AVX-LABEL: llrint_v1i64_v1f128:
+; AVX: # %bb.0:
+; AVX-NEXT: pushq %rax
+; AVX-NEXT: callq llrintl@PLT
+; AVX-NEXT: popq %rcx
+; AVX-NEXT: retq
+;
+; AVX512DQ-LABEL: llrint_v1i64_v1f128:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rax
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: popq %rcx
+; AVX512DQ-NEXT: retq
+ %a = call <1 x i64> @llvm.llrint.v1i64.v1f128(<1 x fp128> %x)
+ ret <1 x i64> %a
+}
+declare <1 x i64> @llvm.llrint.v1i64.v1f128(<1 x fp128>)
+
+define <2 x i64> @llrint_v2i64_v2f128(<2 x fp128> %x) nounwind {
+; X86-LABEL: llrint_v2i64_v2f128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: pushl 24(%ebp)
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl 12(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: pushl 40(%ebp)
+; X86-NEXT: pushl 36(%ebp)
+; X86-NEXT: pushl 32(%ebp)
+; X86-NEXT: pushl 28(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, 8(%esi)
+; X86-NEXT: movl %ebx, 4(%esi)
+; X86-NEXT: movl %edi, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; SSE-LABEL: llrint_v2i64_v2f128:
+; SSE: # %bb.0:
+; SSE-NEXT: subq $40, %rsp
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: addq $40, %rsp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: llrint_v2i64_v2f128:
+; AVX: # %bb.0:
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: callq llrintl@PLT
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: callq llrintl@PLT
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
+;
+; AVX512DQ-LABEL: llrint_v2i64_v2f128:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: subq $40, %rsp
+; AVX512DQ-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm1, %xmm0
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: addq $40, %rsp
+; AVX512DQ-NEXT: retq
+ %a = call <2 x i64> @llvm.llrint.v2i64.v2f128(<2 x fp128> %x)
+ ret <2 x i64> %a
+}
+declare <2 x i64> @llvm.llrint.v2i64.v2f128(<2 x fp128>)
+
+define <4 x i64> @llrint_v4i64_v4f128(<4 x fp128> %x) nounwind {
+; X86-LABEL: llrint_v4i64_v4f128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: pushl 24(%ebp)
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl 12(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl 32(%ebp)
+; X86-NEXT: pushl 28(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl 56(%ebp)
+; X86-NEXT: pushl 52(%ebp)
+; X86-NEXT: pushl 48(%ebp)
+; X86-NEXT: pushl 44(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: pushl 72(%ebp)
+; X86-NEXT: pushl 68(%ebp)
+; X86-NEXT: pushl 64(%ebp)
+; X86-NEXT: pushl 60(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %edx, 28(%esi)
+; X86-NEXT: movl %eax, 24(%esi)
+; X86-NEXT: movl %ebx, 20(%esi)
+; X86-NEXT: movl %edi, 16(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; SSE-LABEL: llrint_v4i64_v4f128:
+; SSE: # %bb.0:
+; SSE-NEXT: subq $72, %rsp
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = xmm1[0],mem[0]
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: addq $72, %rsp
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: llrint_v4i64_v4f128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: subq $72, %rsp
+; AVX1-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm3, %xmm0
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX1-NEXT: addq $72, %rsp
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: llrint_v4i64_v4f128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm3, %xmm0
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: retq
+;
+; AVX512DQ-LABEL: llrint_v4i64_v4f128:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: subq $72, %rsp
+; AVX512DQ-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm3, %xmm0
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: addq $72, %rsp
+; AVX512DQ-NEXT: retq
+ %a = call <4 x i64> @llvm.llrint.v4i64.v4f128(<4 x fp128> %x)
+ ret <4 x i64> %a
+}
+declare <4 x i64> @llvm.llrint.v4i64.v4f128(<4 x fp128>)
+
+define <8 x i64> @llrint_v8i64_v8f128(<8 x fp128> %x) nounwind {
+; X86-LABEL: llrint_v8i64_v8f128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $64, %esp
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: pushl 24(%ebp)
+; X86-NEXT: pushl 20(%ebp)
+; X86-NEXT: pushl 16(%ebp)
+; X86-NEXT: pushl 12(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl 32(%ebp)
+; X86-NEXT: pushl 28(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl 56(%ebp)
+; X86-NEXT: pushl 52(%ebp)
+; X86-NEXT: pushl 48(%ebp)
+; X86-NEXT: pushl 44(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl 72(%ebp)
+; X86-NEXT: pushl 68(%ebp)
+; X86-NEXT: pushl 64(%ebp)
+; X86-NEXT: pushl 60(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl 88(%ebp)
+; X86-NEXT: pushl 84(%ebp)
+; X86-NEXT: pushl 80(%ebp)
+; X86-NEXT: pushl 76(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl 104(%ebp)
+; X86-NEXT: pushl 100(%ebp)
+; X86-NEXT: pushl 96(%ebp)
+; X86-NEXT: pushl 92(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl 120(%ebp)
+; X86-NEXT: pushl 116(%ebp)
+; X86-NEXT: pushl 112(%ebp)
+; X86-NEXT: pushl 108(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: pushl 136(%ebp)
+; X86-NEXT: pushl 132(%ebp)
+; X86-NEXT: pushl 128(%ebp)
+; X86-NEXT: pushl 124(%ebp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %edx, 60(%esi)
+; X86-NEXT: movl %eax, 56(%esi)
+; X86-NEXT: movl %ebx, 52(%esi)
+; X86-NEXT: movl %edi, 48(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 44(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%esi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; SSE-LABEL: llrint_v8i64_v8f128:
+; SSE: # %bb.0:
+; SSE-NEXT: subq $136, %rsp
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq llrintl@PLT
+; SSE-NEXT: movq %rax, %xmm3
+; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = xmm3[0],mem[0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: addq $136, %rsp
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: llrint_v8i64_v8f128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: subq $152, %rsp
+; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm3, %xmm0
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq llrintl@PLT
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: addq $152, %rsp
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: llrint_v8i64_v8f128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: subq $152, %rsp
+; AVX512-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm7, %xmm0
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq llrintl@PLT
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: addq $152, %rsp
+; AVX512-NEXT: retq
+;
+; AVX512DQ-LABEL: llrint_v8i64_v8f128:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: subq $152, %rsp
+; AVX512DQ-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps %xmm7, %xmm0
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: callq llrintl@PLT
+; AVX512DQ-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: addq $152, %rsp
+; AVX512DQ-NEXT: retq
+ %a = call <8 x i64> @llvm.llrint.v8i64.v8f128(<8 x fp128> %x)
+ ret <8 x i64> %a
+}
+declare <8 x i64> @llvm.llrint.v8i64.v8f128(<8 x fp128>)
diff --git a/llvm/test/CodeGen/X86/vector-lrint-f16.ll b/llvm/test/CodeGen/X86/vector-lrint-f16.ll
index 1316f80..fa3aeb0 100644
--- a/llvm/test/CodeGen/X86/vector-lrint-f16.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint-f16.ll
@@ -8,7 +8,7 @@
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx2,f16c | FileCheck %s --check-prefixes=X64-AVX-I32
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=X64-FP16-I32
-define <1 x iXLen> @lrint_v1f16(<1 x half> %x) {
+define <1 x iXLen> @lrint_v1f16(<1 x half> %x) nounwind {
; X86-AVX-I16-LABEL: lrint_v1f16:
; X86-AVX-I16: # %bb.0:
; X86-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -73,7 +73,7 @@ define <1 x iXLen> @lrint_v1f16(<1 x half> %x) {
}
declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f16(<1 x half>)
-define <2 x iXLen> @lrint_v2f16(<2 x half> %x) {
+define <2 x iXLen> @lrint_v2f16(<2 x half> %x) nounwind {
; X86-AVX-I16-LABEL: lrint_v2f16:
; X86-AVX-I16: # %bb.0:
; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm1
@@ -250,7 +250,7 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) {
}
declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f16(<2 x half>)
-define <4 x iXLen> @lrint_v4f16(<4 x half> %x) {
+define <4 x iXLen> @lrint_v4f16(<4 x half> %x) nounwind {
; X86-AVX-I16-LABEL: lrint_v4f16:
; X86-AVX-I16: # %bb.0:
; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm1
@@ -455,7 +455,7 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) {
}
declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f16(<4 x half>)
-define <8 x iXLen> @lrint_v8f16(<8 x half> %x) {
+define <8 x iXLen> @lrint_v8f16(<8 x half> %x) nounwind {
; X86-AVX-I16-LABEL: lrint_v8f16:
; X86-AVX-I16: # %bb.0:
; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm1
@@ -718,7 +718,7 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) {
}
declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f16(<8 x half>)
-define <16 x iXLen> @lrint_v16f16(<16 x half> %x) {
+define <16 x iXLen> @lrint_v16f16(<16 x half> %x) nounwind {
; X86-AVX-I16-LABEL: lrint_v16f16:
; X86-AVX-I16: # %bb.0:
; X86-AVX-I16-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -1211,7 +1211,7 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) {
}
declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half>)
-define <32 x iXLen> @lrint_v32f32(<32 x half> %x) {
+define <32 x iXLen> @lrint_v32f32(<32 x half> %x) nounwind {
; X86-AVX-I16-LABEL: lrint_v32f32:
; X86-AVX-I16: # %bb.0:
; X86-AVX-I16-NEXT: vextracti128 $1, %ymm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll
index b1c8d46..b3e5a09 100644
--- a/llvm/test/CodeGen/X86/vector-lrint.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint.ll
@@ -1,4 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown | FileCheck %s --check-prefix=X86-I32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=i686-unknown | FileCheck %s --check-prefix=X86-I64
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefix=X86-SSE2
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32
@@ -10,7 +12,30 @@
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512-i64
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512DQ-i64
-define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
+define <1 x iXLen> @lrint_v1f32(<1 x float> %x) nounwind {
+; X86-I32-LABEL: lrint_v1f32:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %eax
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl (%esp)
+; X86-I32-NEXT: movl (%esp), %eax
+; X86-I32-NEXT: popl %ecx
+; X86-I32-NEXT: retl
+;
+; X86-I64-LABEL: lrint_v1f32:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $8, %esp
+; X86-I64-NEXT: flds 8(%ebp)
+; X86-I64-NEXT: fistpll (%esp)
+; X86-I64-NEXT: movl (%esp), %eax
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl %ebp, %esp
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl
+;
; X86-SSE2-LABEL: lrint_v1f32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtss2si {{[0-9]+}}(%esp), %eax
@@ -35,7 +60,46 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
}
declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>)
-define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
+define <2 x iXLen> @lrint_v2f32(<2 x float> %x) nounwind {
+; X86-I32-LABEL: lrint_v2f32:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: subl $8, %esp
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl (%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: movl (%esp), %eax
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I32-NEXT: addl $8, %esp
+; X86-I32-NEXT: retl
+;
+; X86-I64-LABEL: lrint_v2f32:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $16, %esp
+; X86-I64-NEXT: movl 8(%ebp), %eax
+; X86-I64-NEXT: flds 16(%ebp)
+; X86-I64-NEXT: flds 12(%ebp)
+; X86-I64-NEXT: fistpll (%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: movl (%esp), %ecx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I64-NEXT: movl %edi, 12(%eax)
+; X86-I64-NEXT: movl %esi, 8(%eax)
+; X86-I64-NEXT: movl %edx, 4(%eax)
+; X86-I64-NEXT: movl %ecx, (%eax)
+; X86-I64-NEXT: leal -8(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
; X86-SSE2-LABEL: lrint_v2f32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtps2dq %xmm0, %xmm0
@@ -80,7 +144,81 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
}
declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
-define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
+define <4 x iXLen> @lrint_v4f32(<4 x float> %x) nounwind {
+; X86-I32-LABEL: lrint_v4f32:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: subl $16, %esp
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl (%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: movl (%esp), %ecx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I32-NEXT: movl %edi, 12(%eax)
+; X86-I32-NEXT: movl %esi, 8(%eax)
+; X86-I32-NEXT: movl %edx, 4(%eax)
+; X86-I32-NEXT: movl %ecx, (%eax)
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: retl $4
+;
+; X86-I64-LABEL: lrint_v4f32:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $56, %esp
+; X86-I64-NEXT: movl 8(%ebp), %eax
+; X86-I64-NEXT: flds 24(%ebp)
+; X86-I64-NEXT: flds 20(%ebp)
+; X86-I64-NEXT: flds 16(%ebp)
+; X86-I64-NEXT: flds 12(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NEXT: movl %esi, 28(%eax)
+; X86-I64-NEXT: movl %ecx, 24(%eax)
+; X86-I64-NEXT: movl %edx, 20(%eax)
+; X86-I64-NEXT: movl %ebx, 16(%eax)
+; X86-I64-NEXT: movl %edi, 12(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 8(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 4(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, (%eax)
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
; X86-SSE2-LABEL: lrint_v4f32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtps2dq %xmm0, %xmm0
@@ -141,7 +279,145 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
}
declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
-define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
+define <8 x iXLen> @lrint_v8f32(<8 x float> %x) nounwind {
+; X86-I32-LABEL: lrint_v8f32:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %ebp
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: subl $40, %esp
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I32-NEXT: movl %edx, 28(%eax)
+; X86-I32-NEXT: movl %ecx, 24(%eax)
+; X86-I32-NEXT: movl %ebp, 20(%eax)
+; X86-I32-NEXT: movl %ebx, 16(%eax)
+; X86-I32-NEXT: movl %edi, 12(%eax)
+; X86-I32-NEXT: movl %esi, 8(%eax)
+; X86-I32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-I32-NEXT: movl %ecx, 4(%eax)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I32-NEXT: movl %ecx, (%eax)
+; X86-I32-NEXT: addl $40, %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: popl %ebx
+; X86-I32-NEXT: popl %ebp
+; X86-I32-NEXT: retl $4
+;
+; X86-I64-LABEL: lrint_v8f32:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $120, %esp
+; X86-I64-NEXT: flds 12(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 16(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 20(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 24(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 28(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 32(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 36(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: flds 40(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: movl 8(%ebp), %eax
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-I64-NEXT: movl %ebx, 60(%eax)
+; X86-I64-NEXT: movl %ecx, 56(%eax)
+; X86-I64-NEXT: movl %edx, 52(%eax)
+; X86-I64-NEXT: movl %esi, 48(%eax)
+; X86-I64-NEXT: movl %edi, 44(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 40(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 36(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 32(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 28(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 24(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 20(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 16(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 12(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 8(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 4(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, (%eax)
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
; X86-SSE2-LABEL: lrint_v8f32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtps2dq %xmm0, %xmm0
@@ -235,13 +511,36 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
}
declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float>)
-define <16 x iXLen> @lrint_v16iXLen_v16f32(<16 x float> %x) {
+define <16 x iXLen> @lrint_v16iXLen_v16f32(<16 x float> %x) nounwind {
%a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float> %x)
ret <16 x iXLen> %a
}
declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f32(<16 x float>)
-define <1 x iXLen> @lrint_v1f64(<1 x double> %x) {
+define <1 x iXLen> @lrint_v1f64(<1 x double> %x) nounwind {
+; X86-I32-LABEL: lrint_v1f64:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %eax
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl (%esp)
+; X86-I32-NEXT: movl (%esp), %eax
+; X86-I32-NEXT: popl %ecx
+; X86-I32-NEXT: retl
+;
+; X86-I64-LABEL: lrint_v1f64:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $8, %esp
+; X86-I64-NEXT: fldl 8(%ebp)
+; X86-I64-NEXT: fistpll (%esp)
+; X86-I64-NEXT: movl (%esp), %eax
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl %ebp, %esp
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl
+;
; X86-SSE2-LABEL: lrint_v1f64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtsd2si {{[0-9]+}}(%esp), %eax
@@ -266,7 +565,46 @@ define <1 x iXLen> @lrint_v1f64(<1 x double> %x) {
}
declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f64(<1 x double>)
-define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
+define <2 x iXLen> @lrint_v2f64(<2 x double> %x) nounwind {
+; X86-I32-LABEL: lrint_v2f64:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: subl $8, %esp
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl (%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: movl (%esp), %eax
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I32-NEXT: addl $8, %esp
+; X86-I32-NEXT: retl
+;
+; X86-I64-LABEL: lrint_v2f64:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $16, %esp
+; X86-I64-NEXT: movl 8(%ebp), %eax
+; X86-I64-NEXT: fldl 20(%ebp)
+; X86-I64-NEXT: fldl 12(%ebp)
+; X86-I64-NEXT: fistpll (%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: movl (%esp), %ecx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I64-NEXT: movl %edi, 12(%eax)
+; X86-I64-NEXT: movl %esi, 8(%eax)
+; X86-I64-NEXT: movl %edx, 4(%eax)
+; X86-I64-NEXT: movl %ecx, (%eax)
+; X86-I64-NEXT: leal -8(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
; X86-SSE2-LABEL: lrint_v2f64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtpd2dq %xmm0, %xmm0
@@ -311,7 +649,81 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
}
declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double>)
-define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
+define <4 x iXLen> @lrint_v4f64(<4 x double> %x) nounwind {
+; X86-I32-LABEL: lrint_v4f64:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: subl $16, %esp
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl (%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: movl (%esp), %ecx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I32-NEXT: movl %edi, 12(%eax)
+; X86-I32-NEXT: movl %esi, 8(%eax)
+; X86-I32-NEXT: movl %edx, 4(%eax)
+; X86-I32-NEXT: movl %ecx, (%eax)
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: retl $4
+;
+; X86-I64-LABEL: lrint_v4f64:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $56, %esp
+; X86-I64-NEXT: movl 8(%ebp), %eax
+; X86-I64-NEXT: fldl 36(%ebp)
+; X86-I64-NEXT: fldl 28(%ebp)
+; X86-I64-NEXT: fldl 20(%ebp)
+; X86-I64-NEXT: fldl 12(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NEXT: movl %esi, 28(%eax)
+; X86-I64-NEXT: movl %ecx, 24(%eax)
+; X86-I64-NEXT: movl %edx, 20(%eax)
+; X86-I64-NEXT: movl %ebx, 16(%eax)
+; X86-I64-NEXT: movl %edi, 12(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 8(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 4(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, (%eax)
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
; X86-SSE2-LABEL: lrint_v4f64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: cvtpd2dq %xmm1, %xmm1
@@ -376,14 +788,149 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
}
declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double>)
-define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
+define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
+; X86-I32-LABEL: lrint_v8f64:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %ebp
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: subl $40, %esp
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I32-NEXT: movl %edx, 28(%eax)
+; X86-I32-NEXT: movl %ecx, 24(%eax)
+; X86-I32-NEXT: movl %ebp, 20(%eax)
+; X86-I32-NEXT: movl %ebx, 16(%eax)
+; X86-I32-NEXT: movl %edi, 12(%eax)
+; X86-I32-NEXT: movl %esi, 8(%eax)
+; X86-I32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-I32-NEXT: movl %ecx, 4(%eax)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I32-NEXT: movl %ecx, (%eax)
+; X86-I32-NEXT: addl $40, %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: popl %ebx
+; X86-I32-NEXT: popl %ebp
+; X86-I32-NEXT: retl $4
+;
+; X86-I64-LABEL: lrint_v8f64:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-8, %esp
+; X86-I64-NEXT: subl $120, %esp
+; X86-I64-NEXT: fldl 12(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 20(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 28(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 36(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 44(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 52(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 60(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: fldl 68(%ebp)
+; X86-I64-NEXT: fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT: movl 8(%ebp), %eax
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-I64-NEXT: movl %ebx, 60(%eax)
+; X86-I64-NEXT: movl %ecx, 56(%eax)
+; X86-I64-NEXT: movl %edx, 52(%eax)
+; X86-I64-NEXT: movl %esi, 48(%eax)
+; X86-I64-NEXT: movl %edi, 44(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 40(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 36(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 32(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 28(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 24(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 20(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 16(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 12(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 8(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, 4(%eax)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT: movl %ecx, (%eax)
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
; X86-SSE2-LABEL: lrint_v8f64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE2-NEXT: .cfi_offset %ebp, -8
; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
; X86-SSE2-NEXT: andl $-16, %esp
; X86-SSE2-NEXT: subl $16, %esp
; X86-SSE2-NEXT: cvtpd2dq %xmm1, %xmm1
@@ -394,7 +941,6 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
; X86-SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE2-NEXT: movl %ebp, %esp
; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
; X86-SSE2-NEXT: retl
;
; X86-AVX1-LABEL: lrint_v8f64:
@@ -490,3 +1036,1145 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
ret <8 x iXLen> %a
}
declare <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double>)
+
+define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) nounwind {
+; X86-I32-LABEL: lrint_v1fp128:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %ebp
+; X86-I32-NEXT: movl %esp, %ebp
+; X86-I32-NEXT: andl $-16, %esp
+; X86-I32-NEXT: subl $16, %esp
+; X86-I32-NEXT: pushl 20(%ebp)
+; X86-I32-NEXT: pushl 16(%ebp)
+; X86-I32-NEXT: pushl 12(%ebp)
+; X86-I32-NEXT: pushl 8(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %ebp, %esp
+; X86-I32-NEXT: popl %ebp
+; X86-I32-NEXT: retl
+;
+; X86-I64-LABEL: lrint_v1fp128:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: andl $-16, %esp
+; X86-I64-NEXT: subl $16, %esp
+; X86-I64-NEXT: pushl 20(%ebp)
+; X86-I64-NEXT: pushl 16(%ebp)
+; X86-I64-NEXT: pushl 12(%ebp)
+; X86-I64-NEXT: pushl 8(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %ebp, %esp
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl
+;
+; X86-SSE2-LABEL: lrint_v1fp128:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $16, %esp
+; X86-SSE2-NEXT: pushl 20(%ebp)
+; X86-SSE2-NEXT: pushl 16(%ebp)
+; X86-SSE2-NEXT: pushl 12(%ebp)
+; X86-SSE2-NEXT: pushl 8(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: lrint_v1fp128:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-16, %esp
+; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-i32-LABEL: lrint_v1fp128:
+; X64-AVX-i32: # %bb.0:
+; X64-AVX-i32-NEXT: pushq %rax
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: popq %rcx
+; X64-AVX-i32-NEXT: retq
+;
+; X64-AVX-i64-LABEL: lrint_v1fp128:
+; X64-AVX-i64: # %bb.0:
+; X64-AVX-i64-NEXT: pushq %rax
+; X64-AVX-i64-NEXT: callq lrintl@PLT
+; X64-AVX-i64-NEXT: popq %rcx
+; X64-AVX-i64-NEXT: retq
+ %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1fp128(<1 x fp128> %x)
+ ret <1 x iXLen> %a
+}
+declare <1 x iXLen> @llvm.lrint.v1iXLen.v1fp128(<1 x fp128>)
+
+define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) nounwind {
+; X86-I32-LABEL: lrint_v2fp128:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %ebp
+; X86-I32-NEXT: movl %esp, %ebp
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: andl $-16, %esp
+; X86-I32-NEXT: subl $16, %esp
+; X86-I32-NEXT: movl 32(%ebp), %edi
+; X86-I32-NEXT: movl 36(%ebp), %ebx
+; X86-I32-NEXT: pushl 20(%ebp)
+; X86-I32-NEXT: pushl 16(%ebp)
+; X86-I32-NEXT: pushl 12(%ebp)
+; X86-I32-NEXT: pushl 8(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, %esi
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl 28(%ebp)
+; X86-I32-NEXT: pushl 24(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, %edx
+; X86-I32-NEXT: movl %esi, %eax
+; X86-I32-NEXT: leal -12(%ebp), %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: popl %ebx
+; X86-I32-NEXT: popl %ebp
+; X86-I32-NEXT: retl
+;
+; X86-I64-LABEL: lrint_v2fp128:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-16, %esp
+; X86-I64-NEXT: subl $16, %esp
+; X86-I64-NEXT: movl 8(%ebp), %esi
+; X86-I64-NEXT: pushl 24(%ebp)
+; X86-I64-NEXT: pushl 20(%ebp)
+; X86-I64-NEXT: pushl 16(%ebp)
+; X86-I64-NEXT: pushl 12(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, %edi
+; X86-I64-NEXT: movl %edx, %ebx
+; X86-I64-NEXT: pushl 40(%ebp)
+; X86-I64-NEXT: pushl 36(%ebp)
+; X86-I64-NEXT: pushl 32(%ebp)
+; X86-I64-NEXT: pushl 28(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %edx, 12(%esi)
+; X86-I64-NEXT: movl %eax, 8(%esi)
+; X86-I64-NEXT: movl %ebx, 4(%esi)
+; X86-I64-NEXT: movl %edi, (%esi)
+; X86-I64-NEXT: movl %esi, %eax
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
+; X86-SSE2-LABEL: lrint_v2fp128:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-SSE2-NEXT: movl 20(%ebp), %esi
+; X86-SSE2-NEXT: pushl 36(%ebp)
+; X86-SSE2-NEXT: pushl 32(%ebp)
+; X86-SSE2-NEXT: pushl 28(%ebp)
+; X86-SSE2-NEXT: pushl 24(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl 8(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: punpckldq (%esp), %xmm0 # 16-byte Folded Reload
+; X86-SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; X86-SSE2-NEXT: leal -12(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: lrint_v2fp128:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: andl $-16, %esp
+; X86-AVX-NEXT: subl $48, %esp
+; X86-AVX-NEXT: vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: vmovups 24(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: vmovd %eax, %xmm0
+; X86-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X86-AVX-NEXT: movl %ebp, %esp
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-i32-LABEL: lrint_v2fp128:
+; X64-AVX-i32: # %bb.0:
+; X64-AVX-i32-NEXT: pushq %rbx
+; X64-AVX-i32-NEXT: subq $16, %rsp
+; X64-AVX-i32-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-i32-NEXT: vmovaps %xmm1, %xmm0
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: movl %eax, %ebx
+; X64-AVX-i32-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: vmovd %eax, %xmm0
+; X64-AVX-i32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX-i32-NEXT: addq $16, %rsp
+; X64-AVX-i32-NEXT: popq %rbx
+; X64-AVX-i32-NEXT: retq
+;
+; X64-AVX-i64-LABEL: lrint_v2fp128:
+; X64-AVX-i64: # %bb.0:
+; X64-AVX-i64-NEXT: subq $40, %rsp
+; X64-AVX-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX-i64-NEXT: vmovaps %xmm1, %xmm0
+; X64-AVX-i64-NEXT: callq lrintl@PLT
+; X64-AVX-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX-i64-NEXT: callq lrintl@PLT
+; X64-AVX-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX-i64-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX-i64-NEXT: addq $40, %rsp
+; X64-AVX-i64-NEXT: retq
+ %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2fp128(<2 x fp128> %x)
+ ret <2 x iXLen> %a
+}
+declare <2 x iXLen> @llvm.lrint.v2iXLen.v2fp128(<2 x fp128>)
+
+define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
+; X86-I32-LABEL: lrint_v4fp128:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %ebp
+; X86-I32-NEXT: movl %esp, %ebp
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: andl $-16, %esp
+; X86-I32-NEXT: subl $16, %esp
+; X86-I32-NEXT: movl 8(%ebp), %esi
+; X86-I32-NEXT: movl 36(%ebp), %ebx
+; X86-I32-NEXT: movl 40(%ebp), %edi
+; X86-I32-NEXT: pushl 24(%ebp)
+; X86-I32-NEXT: pushl 20(%ebp)
+; X86-I32-NEXT: pushl 16(%ebp)
+; X86-I32-NEXT: pushl 12(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl 32(%ebp)
+; X86-I32-NEXT: pushl 28(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, %ebx
+; X86-I32-NEXT: pushl 56(%ebp)
+; X86-I32-NEXT: pushl 52(%ebp)
+; X86-I32-NEXT: pushl 48(%ebp)
+; X86-I32-NEXT: pushl 44(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, %edi
+; X86-I32-NEXT: pushl 72(%ebp)
+; X86-I32-NEXT: pushl 68(%ebp)
+; X86-I32-NEXT: pushl 64(%ebp)
+; X86-I32-NEXT: pushl 60(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, 12(%esi)
+; X86-I32-NEXT: movl %edi, 8(%esi)
+; X86-I32-NEXT: movl %ebx, 4(%esi)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT: movl %eax, (%esi)
+; X86-I32-NEXT: movl %esi, %eax
+; X86-I32-NEXT: leal -12(%ebp), %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: popl %ebx
+; X86-I32-NEXT: popl %ebp
+; X86-I32-NEXT: retl $4
+;
+; X86-I64-LABEL: lrint_v4fp128:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-16, %esp
+; X86-I64-NEXT: subl $32, %esp
+; X86-I64-NEXT: movl 8(%ebp), %esi
+; X86-I64-NEXT: movl 36(%ebp), %edi
+; X86-I64-NEXT: movl 40(%ebp), %ebx
+; X86-I64-NEXT: pushl 24(%ebp)
+; X86-I64-NEXT: pushl 20(%ebp)
+; X86-I64-NEXT: pushl 16(%ebp)
+; X86-I64-NEXT: pushl 12(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl 32(%ebp)
+; X86-I64-NEXT: pushl 28(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl 56(%ebp)
+; X86-I64-NEXT: pushl 52(%ebp)
+; X86-I64-NEXT: pushl 48(%ebp)
+; X86-I64-NEXT: pushl 44(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, %edi
+; X86-I64-NEXT: movl %edx, %ebx
+; X86-I64-NEXT: pushl 72(%ebp)
+; X86-I64-NEXT: pushl 68(%ebp)
+; X86-I64-NEXT: pushl 64(%ebp)
+; X86-I64-NEXT: pushl 60(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %edx, 28(%esi)
+; X86-I64-NEXT: movl %eax, 24(%esi)
+; X86-I64-NEXT: movl %ebx, 20(%esi)
+; X86-I64-NEXT: movl %edi, 16(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 12(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 8(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 4(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, (%esi)
+; X86-I64-NEXT: movl %esi, %eax
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
+; X86-SSE2-LABEL: lrint_v4fp128:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $48, %esp
+; X86-SSE2-NEXT: movl 48(%ebp), %edi
+; X86-SSE2-NEXT: movl 52(%ebp), %ebx
+; X86-SSE2-NEXT: pushl 36(%ebp)
+; X86-SSE2-NEXT: pushl 32(%ebp)
+; X86-SSE2-NEXT: pushl 28(%ebp)
+; X86-SSE2-NEXT: pushl 24(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, %esi
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl 44(%ebp)
+; X86-SSE2-NEXT: pushl 40(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, %edi
+; X86-SSE2-NEXT: pushl 68(%ebp)
+; X86-SSE2-NEXT: pushl 64(%ebp)
+; X86-SSE2-NEXT: pushl 60(%ebp)
+; X86-SSE2-NEXT: pushl 56(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: movd %edi, %xmm1
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movd %esi, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: pushl 20(%ebp)
+; X86-SSE2-NEXT: pushl 16(%ebp)
+; X86-SSE2-NEXT: pushl 12(%ebp)
+; X86-SSE2-NEXT: pushl 8(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: punpckldq (%esp), %xmm0 # 16-byte Folded Reload
+; X86-SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; X86-SSE2-NEXT: punpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X86-SSE2-NEXT: # xmm0 = xmm0[0],mem[0]
+; X86-SSE2-NEXT: leal -12(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: lrint_v4fp128:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebp
+; X86-AVX-NEXT: movl %esp, %ebp
+; X86-AVX-NEXT: pushl %edi
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: andl $-16, %esp
+; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: vmovups 40(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: movl %eax, %esi
+; X86-AVX-NEXT: vmovups 24(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: movl %eax, %edi
+; X86-AVX-NEXT: vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: vmovups 56(%ebp), %xmm0
+; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: vmovd %eax, %xmm0
+; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
+; X86-AVX-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-AVX-NEXT: calll lrintl
+; X86-AVX-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X86-AVX-NEXT: leal -8(%ebp), %esp
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: popl %edi
+; X86-AVX-NEXT: popl %ebp
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-i32-LABEL: lrint_v4fp128:
+; X64-AVX-i32: # %bb.0:
+; X64-AVX-i32-NEXT: pushq %rbx
+; X64-AVX-i32-NEXT: subq $48, %rsp
+; X64-AVX-i32-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX-i32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX-i32-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-i32-NEXT: vmovaps %xmm1, %xmm0
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: movl %eax, %ebx
+; X64-AVX-i32-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: vmovd %eax, %xmm0
+; X64-AVX-i32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX-i32-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX-i32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; X64-AVX-i32-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX-i32-NEXT: callq lrintl@PLT
+; X64-AVX-i32-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX-i32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X64-AVX-i32-NEXT: addq $48, %rsp
+; X64-AVX-i32-NEXT: popq %rbx
+; X64-AVX-i32-NEXT: retq
+;
+; X64-AVX1-i64-LABEL: lrint_v4fp128:
+; X64-AVX1-i64: # %bb.0:
+; X64-AVX1-i64-NEXT: subq $72, %rsp
+; X64-AVX1-i64-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm3, %xmm0
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX1-i64-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: addq $72, %rsp
+; X64-AVX1-i64-NEXT: retq
+;
+; AVX512-i64-LABEL: lrint_v4fp128:
+; AVX512-i64: # %bb.0:
+; AVX512-i64-NEXT: subq $72, %rsp
+; AVX512-i64-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm3, %xmm0
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-i64-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: addq $72, %rsp
+; AVX512-i64-NEXT: retq
+;
+; AVX512DQ-i64-LABEL: lrint_v4fp128:
+; AVX512DQ-i64: # %bb.0:
+; AVX512DQ-i64-NEXT: subq $72, %rsp
+; AVX512DQ-i64-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm3, %xmm0
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-i64-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: addq $72, %rsp
+; AVX512DQ-i64-NEXT: retq
+ %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4fp128(<4 x fp128> %x)
+ ret <4 x iXLen> %a
+}
+declare <4 x iXLen> @llvm.lrint.v4iXLen.v4fp128(<4 x fp128>)
+
+define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
+; X86-I32-LABEL: lrint_v8fp128:
+; X86-I32: # %bb.0:
+; X86-I32-NEXT: pushl %ebp
+; X86-I32-NEXT: movl %esp, %ebp
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %esi
+; X86-I32-NEXT: andl $-16, %esp
+; X86-I32-NEXT: subl $32, %esp
+; X86-I32-NEXT: movl 8(%ebp), %esi
+; X86-I32-NEXT: movl 36(%ebp), %ebx
+; X86-I32-NEXT: movl 40(%ebp), %edi
+; X86-I32-NEXT: pushl 24(%ebp)
+; X86-I32-NEXT: pushl 20(%ebp)
+; X86-I32-NEXT: pushl 16(%ebp)
+; X86-I32-NEXT: pushl 12(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: pushl %edi
+; X86-I32-NEXT: pushl %ebx
+; X86-I32-NEXT: pushl 32(%ebp)
+; X86-I32-NEXT: pushl 28(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: pushl 56(%ebp)
+; X86-I32-NEXT: pushl 52(%ebp)
+; X86-I32-NEXT: pushl 48(%ebp)
+; X86-I32-NEXT: pushl 44(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: pushl 72(%ebp)
+; X86-I32-NEXT: pushl 68(%ebp)
+; X86-I32-NEXT: pushl 64(%ebp)
+; X86-I32-NEXT: pushl 60(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: pushl 88(%ebp)
+; X86-I32-NEXT: pushl 84(%ebp)
+; X86-I32-NEXT: pushl 80(%ebp)
+; X86-I32-NEXT: pushl 76(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT: pushl 104(%ebp)
+; X86-I32-NEXT: pushl 100(%ebp)
+; X86-I32-NEXT: pushl 96(%ebp)
+; X86-I32-NEXT: pushl 92(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, %ebx
+; X86-I32-NEXT: pushl 120(%ebp)
+; X86-I32-NEXT: pushl 116(%ebp)
+; X86-I32-NEXT: pushl 112(%ebp)
+; X86-I32-NEXT: pushl 108(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, %edi
+; X86-I32-NEXT: pushl 136(%ebp)
+; X86-I32-NEXT: pushl 132(%ebp)
+; X86-I32-NEXT: pushl 128(%ebp)
+; X86-I32-NEXT: pushl 124(%ebp)
+; X86-I32-NEXT: calll lrintl
+; X86-I32-NEXT: addl $16, %esp
+; X86-I32-NEXT: movl %eax, 28(%esi)
+; X86-I32-NEXT: movl %edi, 24(%esi)
+; X86-I32-NEXT: movl %ebx, 20(%esi)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT: movl %eax, 16(%esi)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT: movl %eax, 12(%esi)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT: movl %eax, 8(%esi)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT: movl %eax, 4(%esi)
+; X86-I32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT: movl %eax, (%esi)
+; X86-I32-NEXT: movl %esi, %eax
+; X86-I32-NEXT: leal -12(%ebp), %esp
+; X86-I32-NEXT: popl %esi
+; X86-I32-NEXT: popl %edi
+; X86-I32-NEXT: popl %ebx
+; X86-I32-NEXT: popl %ebp
+; X86-I32-NEXT: retl $4
+;
+; X86-I64-LABEL: lrint_v8fp128:
+; X86-I64: # %bb.0:
+; X86-I64-NEXT: pushl %ebp
+; X86-I64-NEXT: movl %esp, %ebp
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl %esi
+; X86-I64-NEXT: andl $-16, %esp
+; X86-I64-NEXT: subl $64, %esp
+; X86-I64-NEXT: movl 8(%ebp), %esi
+; X86-I64-NEXT: movl 36(%ebp), %edi
+; X86-I64-NEXT: movl 40(%ebp), %ebx
+; X86-I64-NEXT: pushl 24(%ebp)
+; X86-I64-NEXT: pushl 20(%ebp)
+; X86-I64-NEXT: pushl 16(%ebp)
+; X86-I64-NEXT: pushl 12(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl %ebx
+; X86-I64-NEXT: pushl %edi
+; X86-I64-NEXT: pushl 32(%ebp)
+; X86-I64-NEXT: pushl 28(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl 56(%ebp)
+; X86-I64-NEXT: pushl 52(%ebp)
+; X86-I64-NEXT: pushl 48(%ebp)
+; X86-I64-NEXT: pushl 44(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl 72(%ebp)
+; X86-I64-NEXT: pushl 68(%ebp)
+; X86-I64-NEXT: pushl 64(%ebp)
+; X86-I64-NEXT: pushl 60(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl 88(%ebp)
+; X86-I64-NEXT: pushl 84(%ebp)
+; X86-I64-NEXT: pushl 80(%ebp)
+; X86-I64-NEXT: pushl 76(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl 104(%ebp)
+; X86-I64-NEXT: pushl 100(%ebp)
+; X86-I64-NEXT: pushl 96(%ebp)
+; X86-I64-NEXT: pushl 92(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT: pushl 120(%ebp)
+; X86-I64-NEXT: pushl 116(%ebp)
+; X86-I64-NEXT: pushl 112(%ebp)
+; X86-I64-NEXT: pushl 108(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %eax, %edi
+; X86-I64-NEXT: movl %edx, %ebx
+; X86-I64-NEXT: pushl 136(%ebp)
+; X86-I64-NEXT: pushl 132(%ebp)
+; X86-I64-NEXT: pushl 128(%ebp)
+; X86-I64-NEXT: pushl 124(%ebp)
+; X86-I64-NEXT: calll lrintl
+; X86-I64-NEXT: addl $16, %esp
+; X86-I64-NEXT: movl %edx, 60(%esi)
+; X86-I64-NEXT: movl %eax, 56(%esi)
+; X86-I64-NEXT: movl %ebx, 52(%esi)
+; X86-I64-NEXT: movl %edi, 48(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 44(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 40(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 36(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 32(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 28(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 24(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 20(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 16(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 12(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 8(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, 4(%esi)
+; X86-I64-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT: movl %eax, (%esi)
+; X86-I64-NEXT: movl %esi, %eax
+; X86-I64-NEXT: leal -12(%ebp), %esp
+; X86-I64-NEXT: popl %esi
+; X86-I64-NEXT: popl %edi
+; X86-I64-NEXT: popl %ebx
+; X86-I64-NEXT: popl %ebp
+; X86-I64-NEXT: retl $4
+;
+; X86-SSE2-LABEL: lrint_v8fp128:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $64, %esp
+; X86-SSE2-NEXT: movl 108(%ebp), %esi
+; X86-SSE2-NEXT: movl 112(%ebp), %edi
+; X86-SSE2-NEXT: movl 116(%ebp), %ebx
+; X86-SSE2-NEXT: pushl 100(%ebp)
+; X86-SSE2-NEXT: pushl 96(%ebp)
+; X86-SSE2-NEXT: pushl 92(%ebp)
+; X86-SSE2-NEXT: pushl 88(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: pushl 104(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: pushl 132(%ebp)
+; X86-SSE2-NEXT: pushl 128(%ebp)
+; X86-SSE2-NEXT: pushl 124(%ebp)
+; X86-SSE2-NEXT: pushl 120(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: pushl 20(%ebp)
+; X86-SSE2-NEXT: pushl 16(%ebp)
+; X86-SSE2-NEXT: pushl 12(%ebp)
+; X86-SSE2-NEXT: pushl 8(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, %esi
+; X86-SSE2-NEXT: pushl 36(%ebp)
+; X86-SSE2-NEXT: pushl 32(%ebp)
+; X86-SSE2-NEXT: pushl 28(%ebp)
+; X86-SSE2-NEXT: pushl 24(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, %edi
+; X86-SSE2-NEXT: pushl 52(%ebp)
+; X86-SSE2-NEXT: pushl 48(%ebp)
+; X86-SSE2-NEXT: pushl 44(%ebp)
+; X86-SSE2-NEXT: pushl 40(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movl %eax, %ebx
+; X86-SSE2-NEXT: pushl 68(%ebp)
+; X86-SSE2-NEXT: pushl 64(%ebp)
+; X86-SSE2-NEXT: pushl 60(%ebp)
+; X86-SSE2-NEXT: pushl 56(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movd %eax, %xmm0
+; X86-SSE2-NEXT: movd %ebx, %xmm1
+; X86-SSE2-NEXT: movd %edi, %xmm2
+; X86-SSE2-NEXT: movd %esi, %xmm4
+; X86-SSE2-NEXT: movss (%esp), %xmm3 # 4-byte Reload
+; X86-SSE2-NEXT: # xmm3 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 4-byte Reload
+; X86-SSE2-NEXT: # xmm5 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 4-byte Reload
+; X86-SSE2-NEXT: # xmm6 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; X86-SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X86-SSE2-NEXT: movaps %xmm5, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: pushl 84(%ebp)
+; X86-SSE2-NEXT: pushl 80(%ebp)
+; X86-SSE2-NEXT: pushl 76(%ebp)
+; X86-SSE2-NEXT: pushl 72(%ebp)
+; X86-SSE2-NEXT: calll lrintl
+; X86-SSE2-NEXT: addl $16, %esp
+; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: punpckldq {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; X86-SSE2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; X86-SSE2-NEXT: punpcklqdq (%esp), %xmm1 # 16-byte Folded Reload
+; X86-SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
+; X86-SSE2-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT: leal -12(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX1-LABEL: lrint_v8fp128:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebp
+; X86-AVX1-NEXT: movl %esp, %ebp
+; X86-AVX1-NEXT: pushl %ebx
+; X86-AVX1-NEXT: pushl %edi
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: andl $-16, %esp
+; X86-AVX1-NEXT: subl $80, %esp
+; X86-AVX1-NEXT: vmovups 40(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT: vmovups 24(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT: vmovups 8(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT: vmovups 120(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: movl %eax, %esi
+; X86-AVX1-NEXT: vmovups 104(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: movl %eax, %edi
+; X86-AVX1-NEXT: vmovups 88(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: movl %eax, %ebx
+; X86-AVX1-NEXT: vmovups 72(%ebp), %xmm0
+; X86-AVX1-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: vmovd %eax, %xmm0
+; X86-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-AVX1-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; X86-AVX1-NEXT: vmovups 56(%ebp), %xmm1
+; X86-AVX1-NEXT: vmovups %xmm1, (%esp)
+; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; X86-AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-AVX1-NEXT: calll lrintl
+; X86-AVX1-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X86-AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%e{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; X86-AVX1-NEXT: leal -12(%ebp), %esp
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: popl %edi
+; X86-AVX1-NEXT: popl %ebx
+; X86-AVX1-NEXT: popl %ebp
+; X86-AVX1-NEXT: retl
+;
+; X64-AVX1-i32-LABEL: lrint_v8fp128:
+; X64-AVX1-i32: # %bb.0:
+; X64-AVX1-i32-NEXT: pushq %rbx
+; X64-AVX1-i32-NEXT: subq $112, %rsp
+; X64-AVX1-i32-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps %xmm5, %xmm0
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: movl %eax, %ebx
+; X64-AVX1-i32-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: vmovd %eax, %xmm0
+; X64-AVX1-i32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-i32-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; X64-AVX1-i32-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X64-AVX1-i32-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: movl %eax, %ebx
+; X64-AVX1-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: vmovd %eax, %xmm0
+; X64-AVX1-i32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
+; X64-AVX1-i32-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; X64-AVX1-i32-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i32-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: callq lrintl@PLT
+; X64-AVX1-i32-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X64-AVX1-i32-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; X64-AVX1-i32-NEXT: addq $112, %rsp
+; X64-AVX1-i32-NEXT: popq %rbx
+; X64-AVX1-i32-NEXT: retq
+;
+; X64-AVX1-i64-LABEL: lrint_v8fp128:
+; X64-AVX1-i64: # %bb.0:
+; X64-AVX1-i64-NEXT: subq $152, %rsp
+; X64-AVX1-i64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps %xmm3, %xmm0
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX1-i64-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: vzeroupper
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-AVX1-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-AVX1-i64-NEXT: callq lrintl@PLT
+; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; X64-AVX1-i64-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
+; X64-AVX1-i64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; X64-AVX1-i64-NEXT: addq $152, %rsp
+; X64-AVX1-i64-NEXT: retq
+;
+; AVX512-i64-LABEL: lrint_v8fp128:
+; AVX512-i64: # %bb.0:
+; AVX512-i64-NEXT: subq $152, %rsp
+; AVX512-i64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps %xmm7, %xmm0
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-i64-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: vzeroupper
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-i64-NEXT: callq lrintl@PLT
+; AVX512-i64-NEXT: vmovq %rax, %xmm0
+; AVX512-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-i64-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512-i64-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-i64-NEXT: addq $152, %rsp
+; AVX512-i64-NEXT: retq
+;
+; AVX512DQ-i64-LABEL: lrint_v8fp128:
+; AVX512DQ-i64: # %bb.0:
+; AVX512DQ-i64-NEXT: subq $152, %rsp
+; AVX512DQ-i64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps %xmm7, %xmm0
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-i64-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: vzeroupper
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-i64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-i64-NEXT: callq lrintl@PLT
+; AVX512DQ-i64-NEXT: vmovq %rax, %xmm0
+; AVX512DQ-i64-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512DQ-i64-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512DQ-i64-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-i64-NEXT: addq $152, %rsp
+; AVX512DQ-i64-NEXT: retq
+ %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8fp128(<8 x fp128> %x)
+ ret <8 x iXLen> %a
+}
+declare <8 x iXLen> @llvm.lrint.v8iXLen.v8fp128(<8 x fp128>)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 9cd0f4d..227e000 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -903,6 +903,95 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
ret i1 %3
}
+define i1 @mask_v8i32_2(<8 x i32> %a0) {
+; SSE2-LABEL: mask_v8i32_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pslld $1, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mask_v8i32_2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: mask_v8i32_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v8i32_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4611686019501129728,4611686019501129728,4611686019501129728,4611686019501129728]
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mask_v8i32_2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4611686019501129728,4611686019501129728,4611686019501129728,4611686019501129728]
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
+ %2 = and i32 %1, 1073741824
+ %3 = icmp eq i32 %2, 0
+ ret i1 %3
+}
+
+
+define i1 @signtest_v8i32(<8 x i32> %a0) {
+; SSE2-LABEL: signtest_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: signtest_v8i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: signtest_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: signtest_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: signtest_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
+ %2 = icmp sgt i32 %1, -1
+ ret i1 %2
+}
+
define i1 @trunc_v16i16(<16 x i16> %a0) {
; SSE2-LABEL: trunc_v16i16:
; SSE2: # %bb.0:
@@ -1073,11 +1162,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: testb $1, %al
-; SSE2-NEXT: je .LBB27_2
+; SSE2-NEXT: je .LBB29_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB27_2:
+; SSE2-NEXT: .LBB29_2:
; SSE2-NEXT: movl $1, %eax
; SSE2-NEXT: retq
;
@@ -1092,11 +1181,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; SSE41-NEXT: pextrd $2, %xmm1, %eax
; SSE41-NEXT: orl %ecx, %eax
; SSE41-NEXT: testb $1, %al
-; SSE41-NEXT: je .LBB27_2
+; SSE41-NEXT: je .LBB29_2
; SSE41-NEXT: # %bb.1:
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB27_2:
+; SSE41-NEXT: .LBB29_2:
; SSE41-NEXT: movl $1, %eax
; SSE41-NEXT: retq
;
@@ -1111,11 +1200,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX1OR2-NEXT: vpextrd $2, %xmm0, %eax
; AVX1OR2-NEXT: orl %ecx, %eax
; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je .LBB27_2
+; AVX1OR2-NEXT: je .LBB29_2
; AVX1OR2-NEXT: # %bb.1:
; AVX1OR2-NEXT: xorl %eax, %eax
; AVX1OR2-NEXT: retq
-; AVX1OR2-NEXT: .LBB27_2:
+; AVX1OR2-NEXT: .LBB29_2:
; AVX1OR2-NEXT: movl $1, %eax
; AVX1OR2-NEXT: retq
;
@@ -1130,12 +1219,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX512F-NEXT: korw %k0, %k1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: testb $1, %al
-; AVX512F-NEXT: je .LBB27_2
+; AVX512F-NEXT: je .LBB29_2
; AVX512F-NEXT: # %bb.1:
; AVX512F-NEXT: xorl %eax, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
-; AVX512F-NEXT: .LBB27_2:
+; AVX512F-NEXT: .LBB29_2:
; AVX512F-NEXT: movl $1, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1151,12 +1240,12 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb $1, %al
-; AVX512BW-NEXT: je .LBB27_2
+; AVX512BW-NEXT: je .LBB29_2
; AVX512BW-NEXT: # %bb.1:
; AVX512BW-NEXT: xorl %eax, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
-; AVX512BW-NEXT: .LBB27_2:
+; AVX512BW-NEXT: .LBB29_2:
; AVX512BW-NEXT: movl $1, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1170,11 +1259,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
; AVX512BWVL-NEXT: korw %k0, %k1, %k0
; AVX512BWVL-NEXT: kmovd %k0, %eax
; AVX512BWVL-NEXT: testb $1, %al
-; AVX512BWVL-NEXT: je .LBB27_2
+; AVX512BWVL-NEXT: je .LBB29_2
; AVX512BWVL-NEXT: # %bb.1:
; AVX512BWVL-NEXT: xorl %eax, %eax
; AVX512BWVL-NEXT: retq
-; AVX512BWVL-NEXT: .LBB27_2:
+; AVX512BWVL-NEXT: .LBB29_2:
; AVX512BWVL-NEXT: movl $1, %eax
; AVX512BWVL-NEXT: retq
%1 = icmp ne <3 x i32> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135d..ef20cf2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1724,6 +1724,269 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
ret void
}
+define <512 x i8> @PR153457(<512 x i8> %a0, <512 x i8> %a1) nounwind {
+; AVX512F-LABEL: PR153457:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-64, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm7
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm7
+; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm9
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm7
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512F-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512F-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT: vpslld $24, %xmm0, %xmm11
+; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovdqa 16(%rbp), %xmm11
+; AVX512F-NEXT: vpsrld $16, %xmm11, %xmm12
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT: vpsrld $24, %xmm11, %xmm8
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpsrlq $48, %xmm11, %xmm8
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpbroadcastb 16(%rbp), %ymm8
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-NEXT: vpsrlq $56, %xmm11, %xmm7
+; AVX512F-NEXT: vmovdqa %ymm7, 416(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm6, 384(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: PR153457:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $64, %rsp
+; AVX512BW-NEXT: movq %rdi, %rax
+; AVX512BW-NEXT: vmovdqa64 16(%rbp), %zmm7
+; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm8
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512BW-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm8
+; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm10
+; AVX512BW-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm8
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm9, %xmm9
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm9
+; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
+; AVX512BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vporq %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm8
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpermi2w %zmm7, %zmm2, %zmm8
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT: vpsrld $24, %xmm7, %xmm3
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti32x4 $3, %xmm7, %zmm4, %zmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,1,2,0]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpbroadcastb 16(%rbp), %ymm9
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm6
+; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: PR153457:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rbp
+; AVX512DQ-NEXT: movq %rsp, %rbp
+; AVX512DQ-NEXT: andq $-64, %rsp
+; AVX512DQ-NEXT: subq $64, %rsp
+; AVX512DQ-NEXT: movq %rdi, %rax
+; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm7
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm7
+; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm9
+; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm7
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-NEXT: vpslld $24, %xmm0, %xmm11
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 16(%rbp), %xmm11
+; AVX512DQ-NEXT: vpsrld $16, %xmm11, %xmm12
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vpsrld $24, %xmm11, %xmm8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512DQ-NEXT: vpsrlq $48, %xmm11, %xmm8
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpbroadcastb 16(%rbp), %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $56, %xmm11, %xmm7
+; AVX512DQ-NEXT: vmovdqa %ymm7, 416(%rdi)
+; AVX512DQ-NEXT: vmovdqa %ymm6, 384(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512DQ-NEXT: movq %rbp, %rsp
+; AVX512DQ-NEXT: popq %rbp
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VBMI-LABEL: PR153457:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-64, %rsp
+; AVX512VBMI-NEXT: subq $64, %rsp
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: vmovdqa64 16(%rbp), %zmm7
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm5, %zmm8
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm4, %zmm5
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0]
+; AVX512VBMI-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm3, %zmm4
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67,0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67]
+; AVX512VBMI-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm3
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,71]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm6, %zmm2
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,0,64,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT: vpermi2b %zmm7, %zmm0, %zmm6
+; AVX512VBMI-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT: vpermi2w %zmm7, %zmm3, %zmm0
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm3 = [67,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,68,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT: vpermi2b %zmm7, %zmm4, %zmm3
+; AVX512VBMI-NEXT: vinserti32x4 $3, %xmm7, %zmm5, %zmm4
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2w %zmm7, %zmm8, %zmm5
+; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,65,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT: vpermi2b %zmm7, %zmm1, %zmm8
+; AVX512VBMI-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm0, 128(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm8, 64(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm6, (%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm2, 384(%rdi)
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+ %shuffle1 = shufflevector <512 x i8> %a0, <512 x i8> zeroinitializer, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %shuffle2 = shufflevector <512 x i8> %shuffle1, <512 x i8> %a1, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 512, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 513, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 514, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 515, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 516, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 517, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 518, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 519, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ ret <512 x i8> %shuffle2
+}
+
define <64 x i8> @shuffle_v32i16_zextinreg_to_v16i32(<64 x i8> %a) {
; ALL-LABEL: shuffle_v32i16_zextinreg_to_v16i32:
; ALL: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll b/llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll
new file mode 100644
index 0000000..3194940
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @foo(i32 %n) #0 !dbg !4 {
+entry:
+ %diff = alloca i32, align 4
+ %cb = alloca [16 x i8], align 16
+ %cc = alloca [16 x i8], align 16
+ store i32 0, ptr %diff, align 4, !tbaa !11
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds [16 x i8], ptr %cb, i64 0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1, !tbaa !21
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds [16 x i8], ptr %cc, i64 0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx2, align 1, !tbaa !21
+ %conv3 = sext i8 %1 to i32
+ %sub = sub i32 %conv, %conv3
+ %add = add nsw i32 %sub, %add8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !25
+
+for.end: ; preds = %for.body
+ store i32 %add, ptr %diff, align 4, !tbaa !11
+ call void @ibar(ptr %diff) #2
+ ret i32 0
+}
+
+declare void @ibar(ptr) #1
+
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+!llvm.dbg.cu = !{!24}
+
+!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !DILocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 17, column: 8, scope: !16)
+!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4)
+!19 = !DILocation(line: 18, column: 5, scope: !20)
+!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18)
+!21 = !{!13, !13, i64 0}
+!22 = !DILocation(line: 20, column: 3, scope: !4)
+!23 = !DILocation(line: 21, column: 3, scope: !4)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug)
+!25 = !{!25, !15}
diff --git a/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir b/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir
index 6f2741f..b97bccd 100644
--- a/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir
+++ b/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir
@@ -120,8 +120,8 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: -24, size: 8, alignment: 8, stack-id: default,
isImmutable: false, isAliased: false, callee-saved-register: '',
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
index f099d4f..474b776 100644
--- a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
@@ -97,18 +97,48 @@ body: |
RET64
...
-;--- double_dealloc.mir
-# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \
-# RUN: -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
-# RUN: --check-prefix=DOUBLE-DEALLOC
-# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \
+;--- dealloc_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN: %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=DEALLOC-AFTER-EPILOG
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN: %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 \
+# RUN: -x86-wineh-unwindv2-force-mode=1 | FileCheck %s \
+# RUN: --check-prefix=BESTEFFORT
+# DEALLOC-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_after_epilog':
+# DEALLOC-AFTER-EPILOG-SAME: Unexpected lea or add instruction after the epilog
+
+--- |
+ define dso_local void @dealloc_after_epilog() local_unnamed_addr {
+ entry:
+ ret void
+ }
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name: dealloc_after_epilog
+body: |
+ bb.0.entry:
+ frame-setup SEH_EndPrologue
+ SEH_BeginEpilogue
+ SEH_EndEpilogue
+ $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+ RET64
+...
+
+;--- pop_before_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN: %t/pop_before_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=POP-BEFORE-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_dealloc.mir \
# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
# RUN: FileCheck %s --check-prefix=BESTEFFORT
-# DOUBLE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'double_dealloc':
-# DOUBLE-DEALLOC-SAME: The epilog is deallocating the stack allocation more than once
+# POP-BEFORE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_dealloc':
+# POP-BEFORE-DEALLOC-SAME: Cannot pop registers before the stack allocation has been deallocated
--- |
- define dso_local void @double_dealloc() local_unnamed_addr {
+ define dso_local void @pop_before_dealloc() local_unnamed_addr {
entry:
ret void
}
@@ -116,32 +146,63 @@ body: |
!0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
...
---
-name: double_dealloc
+name: pop_before_dealloc
body: |
bb.0.entry:
+ frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 55
$rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
frame-setup SEH_StackAlloc 40
frame-setup SEH_EndPrologue
SEH_BeginEpilogue
- $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+ $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
$rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
SEH_EndEpilogue
RET64
...
-;--- dealloc_after_epilog.mir
+;--- mov_no_setframe.mir
# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
-# RUN: %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
-# RUN: FileCheck %s --check-prefix=DEALLOC-AFTER-EPILOG
+# RUN: %t/mov_no_setframe.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=MOV-NO-SETFRAME
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/mov_no_setframe.mir \
+# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN: FileCheck %s --check-prefix=BESTEFFORT
+# MOV-NO-SETFRAME: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'mov_no_setframe':
+# MOV-NO-SETFRAME-SAME: The epilog is setting frame back, but prolog did not set it
+
+--- |
+ define dso_local void @mov_no_setframe() local_unnamed_addr {
+ entry:
+ ret void
+ }
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name: mov_no_setframe
+body: |
+ bb.0.entry:
+ frame-setup SEH_EndPrologue
+ SEH_BeginEpilogue
+ $rsp = MOV64rr $rbp
+ SEH_EndEpilogue
+ RET64
+...
+
+;--- mov_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN: %t/mov_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=MOV-AFTER-EPILOG
# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - \
-# RUN: %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 \
+# RUN: %t/mov_after_epilog.mir -run-pass=x86-wineh-unwindv2 \
# RUN: -x86-wineh-unwindv2-force-mode=1 | FileCheck %s \
# RUN: --check-prefix=BESTEFFORT
-# DEALLOC-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_after_epilog':
-# DEALLOC-AFTER-EPILOG-SAME: Unexpected mov or add instruction after the epilog
+# MOV-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'mov_after_epilog':
+# MOV-AFTER-EPILOG-SAME: Unexpected mov instruction after the epilog
--- |
- define dso_local void @dealloc_after_epilog() local_unnamed_addr {
+ define dso_local void @mov_after_epilog() local_unnamed_addr {
entry:
ret void
}
@@ -149,28 +210,30 @@ body: |
!0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
...
---
-name: dealloc_after_epilog
+name: mov_after_epilog
body: |
bb.0.entry:
+ $rbp = MOV64rr $rsp
+ frame-setup SEH_SetFrame 52, 0
frame-setup SEH_EndPrologue
SEH_BeginEpilogue
SEH_EndEpilogue
- $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+ $rsp = MOV64rr $rbp
RET64
...
-;--- pop_before_dealloc.mir
+;--- pop_before_mov.mir
# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
-# RUN: %t/pop_before_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
-# RUN: FileCheck %s --check-prefix=POP-BEFORE-DEALLOC
-# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_dealloc.mir \
+# RUN: %t/pop_before_mov.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=POP-BEFORE-MOV
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_mov.mir \
# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
# RUN: FileCheck %s --check-prefix=BESTEFFORT
-# POP-BEFORE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_dealloc':
-# POP-BEFORE-DEALLOC-SAME: Cannot pop registers before the stack allocation has been deallocated
+# POP-BEFORE-MOV: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_mov':
+# POP-BEFORE-MOV-SAME: The epilog is setting the frame back after popping registers
--- |
- define dso_local void @pop_before_dealloc() local_unnamed_addr {
+ define dso_local void @pop_before_mov() local_unnamed_addr {
entry:
ret void
}
@@ -178,17 +241,51 @@ body: |
!0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
...
---
-name: pop_before_dealloc
+name: pop_before_mov
body: |
bb.0.entry:
frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
frame-setup SEH_PushReg 55
+ $rbp = MOV64rr $rsp
+ frame-setup SEH_SetFrame 52, 0
+ frame-setup SEH_EndPrologue
+ SEH_BeginEpilogue
+ $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ $rsp = MOV64rr $rbp
+ SEH_EndEpilogue
+ RET64
+...
+
+;--- mov_after_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN: %t/mov_after_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=MOV-AFTER-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/mov_after_dealloc.mir \
+# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN: FileCheck %s --check-prefix=BESTEFFORT
+# MOV-AFTER-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'mov_after_dealloc':
+# MOV-AFTER-DEALLOC-SAME: Cannot set the frame back after the stack allocation has been deallocated
+
+--- |
+ define dso_local void @mov_after_dealloc() local_unnamed_addr {
+ entry:
+ ret void
+ }
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name: mov_after_dealloc
+body: |
+ bb.0.entry:
+ $rbp = MOV64rr $rsp
+ frame-setup SEH_SetFrame 52, 0
$rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
frame-setup SEH_StackAlloc 40
frame-setup SEH_EndPrologue
SEH_BeginEpilogue
- $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
$rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+ $rsp = MOV64rr $rbp
SEH_EndEpilogue
RET64
...
@@ -316,3 +413,38 @@ body: |
$ecx = MOV32rr killed $eax
RET64
...
+
+;--- dealloc_pop_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN: %t/dealloc_pop_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN: FileCheck %s --check-prefix=DEALLOC-POP-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/dealloc_pop_dealloc.mir \
+# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN: FileCheck %s --check-prefix=BESTEFFORT
+# DEALLOC-POP-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_pop_dealloc':
+# DEALLOC-POP-DEALLOC-SAME: The epilog is deallocating a stack allocation after popping registers
+
+--- |
+ define dso_local void @dealloc_pop_dealloc() local_unnamed_addr {
+ entry:
+ ret void
+ }
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name: dealloc_pop_dealloc
+body: |
+ bb.0.entry:
+ frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 55
+ $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+ frame-setup SEH_StackAlloc 40
+ frame-setup SEH_EndPrologue
+ SEH_BeginEpilogue
+ $rsp = frame-destroy ADD64ri32 $rsp, 20, implicit-def dead $eflags
+ $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ $rsp = frame-destroy ADD64ri32 $rsp, 20, implicit-def dead $eflags
+ SEH_EndEpilogue
+ RET64
+...
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-push-pop-stack-alloc.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-push-pop-stack-alloc.mir
new file mode 100644
index 0000000..09a839f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-push-pop-stack-alloc.mir
@@ -0,0 +1,199 @@
+# RUN: llc -o - %s -mtriple=x86_64-unknown-windows-msvc \
+# RUN: -run-pass=x86-wineh-unwindv2 | FileCheck %s
+
+# Regression test for Win x64 unwind v2: in some cases it is better to use
+# push+pop to adjust the stack, rather than sub+add. This is permitted with
+# unwind v2 as the requirement is that the epilog finishes adjusting the stack
+# before popping the registers listed in the unwind table.
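+#
+# A rough sketch (hypothetical epilogs, not taken from the tests below) of the
+# two shapes being compared:
+#   sub+add form:   prolog: subq $8, %rsp   epilog: addq $8, %rsp; popq %rbx; retq
+#   push+pop form:  prolog: pushq %rax      epilog: popq %rcx;     popq %rbx; retq
+# In both cases the stack adjustment (the addq, or the first popq into a
+# scratch register) completes before %rbx, a register recorded in the unwind
+# table, is popped.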
+
+# Pushes and pops the same register.
+# CHECK-LABEL: name: push_pop_same
+# CHECK: body:
+# CHECK-NEXT: bb.0
+# CHECK-NEXT: SEH_UnwindVersion 2
+# CHECK-NEXT: frame-setup PUSH64r undef $rax
+# CHECK-NEXT: frame-setup SEH_StackAlloc 8
+# CHECK-NEXT: frame-setup SEH_EndPrologue
+# CHECK-NEXT: SEH_BeginEpilogue
+# CHECK-NEXT: $rax = frame-destroy
+# CHECK-NEXT: SEH_UnwindV2Start
+# CHECK-NEXT: SEH_EndEpilogue
+# CHECK-NEXT: RET64
+
+# Pushes and pops a different register.
+# CHECK-LABEL: name: push_pop_different
+# CHECK: body:
+# CHECK-NEXT: bb.0
+# CHECK-NEXT: SEH_UnwindVersion 2
+# CHECK-NEXT: frame-setup PUSH64r undef $rax
+# CHECK-NEXT: frame-setup SEH_StackAlloc 8
+# CHECK-NEXT: frame-setup SEH_EndPrologue
+# CHECK: SEH_BeginEpilogue
+# CHECK-NEXT: $rcx = frame-destroy POP64r
+# CHECK-NEXT: SEH_UnwindV2Start
+# CHECK-NEXT: SEH_EndEpilogue
+# CHECK-NEXT: RET64 $eax
+
+# Pushes in the prolog, adds in the epilog.
+# CHECK-LABEL: name: push_add
+# CHECK: body:
+# CHECK-NEXT: bb.0
+# CHECK-NEXT: SEH_UnwindVersion 2
+# CHECK-NEXT: frame-setup PUSH64r killed $r15
+# CHECK-NEXT: frame-setup SEH_PushReg 126
+# CHECK-NEXT: frame-setup PUSH64r killed $r14
+# CHECK-NEXT: frame-setup SEH_PushReg 125
+# CHECK-NEXT: frame-setup PUSH64r killed $rsi
+# CHECK-NEXT: frame-setup SEH_PushReg 60
+# CHECK-NEXT: frame-setup PUSH64r killed $rdi
+# CHECK-NEXT: frame-setup SEH_PushReg 55
+# CHECK-NEXT: frame-setup PUSH64r killed $rbx
+# CHECK-NEXT: frame-setup SEH_PushReg 53
+# CHECK-NEXT: frame-setup PUSH64r undef $rax
+# CHECK-NEXT: frame-setup SEH_StackAlloc 8
+# CHECK-NEXT: frame-setup SEH_EndPrologue
+# CHECK: SEH_BeginEpilogue
+# CHECK-NEXT: $rsp = frame-destroy ADD64ri32 $rsp, 8
+# CHECK-NEXT: SEH_UnwindV2Start
+# CHECK-NEXT: $rbx = frame-destroy POP64r
+# CHECK-NEXT: $rdi = frame-destroy POP64r
+# CHECK-NEXT: $rsi = frame-destroy POP64r
+# CHECK-NEXT: $r14 = frame-destroy POP64r
+# CHECK-NEXT: $r15 = frame-destroy POP64r
+# CHECK-NEXT: SEH_EndEpilogue
+# CHECK-NEXT: RET64
+
+--- |
+ define void @push_pop_same() {
+ %small_alloca = alloca i32, align 4
+ ret void
+ }
+
+ define i32 @push_pop_different(i32 %x, i32 %y) {
+ %small_alloca = alloca i32, align 4
+ %sum = add i32 %x, %y
+ ret i32 %sum
+ }
+
+ define void @push_add(ptr %a, ptr %b, ptr %out) {
+ %small_alloca = alloca i32, align 4
+ %av = load i256, ptr %a, align 16
+ %bv = load i256, ptr %b, align 16
+ %r = mul i256 %av, %bv
+ store i256 %r, ptr %out, align 16
+ ret void
+ }
+
+ !llvm.module.flags = !{!0}
+
+ !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name: push_pop_same
+body: |
+ bb.0 (%ir-block.0):
+ frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_StackAlloc 8
+ frame-setup SEH_EndPrologue
+ SEH_BeginEpilogue
+ $rax = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ SEH_EndEpilogue
+ RET64
+...
+---
+name: push_pop_different
+body: |
+ bb.0 (%ir-block.0):
+ frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_StackAlloc 8
+ frame-setup SEH_EndPrologue
+ renamable $edx = KILL $edx, implicit-def $rdx
+ renamable $ecx = KILL $ecx, implicit-def $rcx
+ renamable $eax = LEA64_32r killed renamable $rcx, 1, killed renamable $rdx, 0, $noreg
+ SEH_BeginEpilogue
+ $rcx = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ SEH_EndEpilogue
+ RET64 $eax
+...
+---
+name: push_add
+body: |
+ bb.0 (%ir-block.0):
+
+ frame-setup PUSH64r killed $r15, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 126
+ frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 125
+ frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 60
+ frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 55
+ frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_PushReg 53
+ frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+ frame-setup SEH_StackAlloc 8
+ frame-setup SEH_EndPrologue
+ $rsi = MOV64rr $rdx
+ renamable $r9 = MOV64rm renamable $rcx, 1, $noreg, 0, $noreg :: (load (s64) from %ir.a, align 16)
+ renamable $rdi = MOV64rm renamable $rcx, 1, $noreg, 8, $noreg :: (load (s64) from %ir.a + 8, basealign 16)
+ renamable $rbx = MOV64rm renamable $rcx, 1, $noreg, 16, $noreg :: (load (s64) from %ir.a + 16, align 16)
+ renamable $r10 = MOV64rm $rdx, 1, $noreg, 16, $noreg :: (load (s64) from %ir.b + 16, align 16)
+ renamable $r11 = MOV64rm $rdx, 1, $noreg, 0, $noreg :: (load (s64) from %ir.b, align 16)
+ renamable $r14 = MOV64rm $rdx, 1, $noreg, 8, $noreg :: (load (s64) from %ir.b + 8, basealign 16)
+ renamable $r15 = MOV64rm killed renamable $rcx, 1, $noreg, 24, $noreg :: (load (s64) from %ir.a + 24, basealign 16)
+ renamable $r15 = IMUL64rr killed renamable $r15, renamable $r11, implicit-def dead $eflags
+ $rax = MOV64rr $r11
+ MUL64r renamable $rbx, implicit-def $rax, implicit-def $rdx, implicit-def dead $eflags, implicit $rax
+ $rcx = MOV64rr $rax
+ renamable $rbx = IMUL64rr killed renamable $rbx, renamable $r14, implicit-def dead $eflags
+ renamable $rbx = ADD64rr killed renamable $rbx, killed renamable $rdx, implicit-def dead $eflags
+ renamable $rbx = ADD64rr killed renamable $rbx, killed renamable $r15, implicit-def dead $eflags
+ $r15 = MOV64rr $r10
+ renamable $r15 = IMUL64rr killed renamable $r15, renamable $rdi, implicit-def dead $eflags
+ $rax = MOV64rr killed $r10
+ MUL64r renamable $r9, implicit-def $rax, implicit-def $rdx, implicit-def dead $eflags, implicit $rax
+ $r10 = MOV64rr $rax
+ renamable $rdx = ADD64rr killed renamable $rdx, killed renamable $r15, implicit-def dead $eflags
+ renamable $r15 = MOV64rm killed renamable $rsi, 1, $noreg, 24, $noreg :: (load (s64) from %ir.b + 24, basealign 16)
+ renamable $r15 = IMUL64rr killed renamable $r15, renamable $r9, implicit-def dead $eflags
+ renamable $r15 = ADD64rr killed renamable $r15, killed renamable $rdx, implicit-def dead $eflags
+ renamable $r10 = ADD64rr killed renamable $r10, killed renamable $rcx, implicit-def $eflags
+ renamable $r15 = ADC64rr killed renamable $r15, killed renamable $rbx, implicit-def dead $eflags, implicit killed $eflags
+ $rax = MOV64rr $r9
+ MUL64r renamable $r11, implicit-def $rax, implicit-def $rdx, implicit-def dead $eflags, implicit $rax
+ $rcx = MOV64rr $rdx
+ $rsi = MOV64rr $rax
+ $rax = MOV64rr $rdi
+ MUL64r killed renamable $r11, implicit-def $rax, implicit-def $rdx, implicit-def dead $eflags, implicit $rax
+ $r11 = MOV64rr $rdx
+ $rbx = MOV64rr $rax
+ renamable $rbx = ADD64rr killed renamable $rbx, killed renamable $rcx, implicit-def $eflags
+ renamable $r11 = ADC64ri32 killed renamable $r11, 0, implicit-def dead $eflags, implicit killed $eflags
+ $rax = MOV64rr killed $r9
+ MUL64r renamable $r14, implicit-def $rax, implicit-def $rdx, implicit-def dead $eflags, implicit $rax
+ $rcx = MOV64rr $rdx
+ $r9 = MOV64rr $rax
+ renamable $r9 = ADD64rr killed renamable $r9, killed renamable $rbx, implicit-def $eflags
+ renamable $rcx = ADC64rr killed renamable $rcx, killed renamable $r11, implicit-def $eflags, implicit killed $eflags
+ renamable $al = SETCCr 2, implicit killed $eflags
+ renamable $r11d = MOVZX32rr8 killed renamable $al, implicit-def $r11
+ $rax = MOV64rr killed $rdi
+ MUL64r killed renamable $r14, implicit-def $rax, implicit-def $rdx, implicit-def dead $eflags, implicit $rax
+ renamable $rax = ADD64rr killed renamable $rax, killed renamable $rcx, implicit-def $eflags
+ renamable $rdx = ADC64rr killed renamable $rdx, killed renamable $r11, implicit-def dead $eflags, implicit killed $eflags
+ renamable $rax = ADD64rr killed renamable $rax, killed renamable $r10, implicit-def $eflags
+ renamable $rdx = ADC64rr killed renamable $rdx, killed renamable $r15, implicit-def dead $eflags, implicit killed $eflags
+ MOV64mr renamable $r8, 1, $noreg, 0, $noreg, killed renamable $rsi :: (store (s64) into %ir.out, align 16)
+ MOV64mr renamable $r8, 1, $noreg, 8, $noreg, killed renamable $r9 :: (store (s64) into %ir.out + 8, basealign 16)
+ MOV64mr renamable $r8, 1, $noreg, 16, $noreg, killed renamable $rax :: (store (s64) into %ir.out + 16, align 16)
+ MOV64mr killed renamable $r8, 1, $noreg, 24, $noreg, killed renamable $rdx :: (store (s64) into %ir.out + 24, basealign 16)
+ SEH_BeginEpilogue
+ $rsp = frame-destroy ADD64ri32 $rsp, 8, implicit-def dead $eflags
+ $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ $r14 = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ $r15 = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ SEH_EndEpilogue
+ RET64
+...
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll b/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll
index a9fd1b9..0d92d04 100644
--- a/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll
@@ -152,9 +152,63 @@ entry:
; CHECK-NEXT: retq
; CHECK-NEXT: .seh_endproc
+define dso_local void @large_aligned_alloc() align 16 {
+ %1 = alloca [128 x i8], align 64
+ ret void
+}
+; CHECK-LABEL: large_aligned_alloc:
+; CHECK: .seh_unwindversion 2
+; CHECK: .seh_pushreg %rbp
+; CHECK: .seh_stackalloc 176
+; CHECK: .seh_setframe %rbp, 128
+; CHECK: .seh_endprologue
+; CHECK-NOT: .seh_endproc
+; CHECK: .seh_startepilogue
+; CHECK-NEXT: leaq 48(%rbp), %rsp
+; CHECK-NEXT: .seh_unwindv2start
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: retq
+; CHECK-NEXT: .seh_endproc
+
+define dso_local void @set_frame_only() local_unnamed_addr {
+ tail call i64 @llvm.x86.flags.read.u64()
+ ret void
+}
+
+; CHECK-LABEL: set_frame_only:
+; CHECK: .seh_unwindversion 2
+; CHECK: .seh_pushreg %rbp
+; CHECK: .seh_setframe %rbp, 0
+; CHECK: .seh_endprologue
+; CHECK-NOT: .seh_endproc
+; CHECK: .seh_startepilogue
+; CHECK-NEXT: .seh_unwindv2start
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: retq
+; CHECK-NEXT: .seh_endproc
+
+attributes #1 = { noreturn }
+define dso_local void @no_return_func() local_unnamed_addr #1 {
+entry:
+ call void @d()
+ unreachable
+}
+; CHECK-LABEL: no_return_func:
+; CHECK-NOT: .seh_unwindversion 2
+; CHECK: .seh_stackalloc
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NOT: .seh_startepilogue
+; CHECK-NOT: .seh_unwindv2start
+; CHECK: int3
+; CHECK-NEXT: .seh_endproc
+
+declare i64 @llvm.x86.flags.read.u64()
declare void @a() local_unnamed_addr
declare i32 @b() local_unnamed_addr
declare i32 @c(i32) local_unnamed_addr
+declare void @d() local_unnamed_addr #1
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"winx64-eh-unwindv2", i32 1}
+!0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
diff --git a/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll b/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll
index 9555ce0..732fc65 100644
--- a/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll
+++ b/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll
@@ -10,5 +10,5 @@ start:
attributes #0 = { nonlazybind uwtable "probe-stack"="probe_stack" "target-cpu"="x86-64" }
; CHECK-LABEL: foo:
-; CHECK: movabsq $4294967304, %rax
+; CHECK: movabsq $4294967312, %rax
; CHECK-NEXT: callq probe_stack
diff --git a/llvm/test/CodeGen/X86/zero-call-used-regs-debug-info.mir b/llvm/test/CodeGen/X86/zero-call-used-regs-debug-info.mir
index 35eeede..be022e3 100644
--- a/llvm/test/CodeGen/X86/zero-call-used-regs-debug-info.mir
+++ b/llvm/test/CodeGen/X86/zero-call-used-regs-debug-info.mir
@@ -117,8 +117,8 @@ frameInfo:
hasMustTailInVarArgFunc: false
hasTailCall: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
callSites: []